py-stringmatching 0.1.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. py_stringmatching-0.1.0/AUTHORS.rst +6 -0
  2. py_stringmatching-0.1.0/CHANGES.txt +6 -0
  3. py_stringmatching-0.1.0/LICENSE +27 -0
  4. py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
  5. py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
  6. py_stringmatching-0.1.0/MANIFEST.in +6 -0
  7. py_stringmatching-0.1.0/PKG-INFO +57 -0
  8. py_stringmatching-0.1.0/README.rst +27 -0
  9. py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
  10. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
  11. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
  12. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
  13. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
  14. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
  15. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
  16. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
  17. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
  18. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
  19. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
  20. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
  21. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
  22. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
  23. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
  24. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
  25. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
  26. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
  27. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
  28. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
  29. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
  30. py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
  31. py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
  32. py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
  33. py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
  34. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
  35. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
  36. py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
  37. py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
  38. py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
  39. py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
  40. py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
  41. py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
  42. py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
  43. py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
  44. py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
  45. py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
  46. py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
  47. py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
  48. py_stringmatching-0.1.0/requirements.txt +2 -0
  49. py_stringmatching-0.1.0/setup.cfg +5 -0
  50. py_stringmatching-0.1.0/setup.py +107 -0
@@ -0,0 +1,85 @@
1
+ from py_stringmatching import utils
2
+ from py_stringmatching.similarity_measure.token_similarity_measure import \
3
+ TokenSimilarityMeasure
4
+
5
+
6
class Dice(TokenSimilarityMeasure):
    """Computes the Dice similarity measure between two token sets.

    The Dice score is twice the size of the intersection divided by the sum
    of the sizes of the two sets. For two sets X and Y:

    :math:`dice(X, Y) = \\frac{2 * |X \\cap Y|}{|X| + |Y|}`
    """

    def __init__(self):
        super(Dice, self).__init__()

    def get_raw_score(self, set1, set2):
        """Computes the raw Dice score between two sets. This score is already in [0,1].

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Dice similarity score (float).

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> dice = Dice()
            >>> dice.get_raw_score(['data', 'science'], ['data'])
            0.6666666666666666
            >>> dice.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> dice.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.5

        References:
            * Wikibooks article : https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient
            * SimMetrics library.
        """
        # input validations (delegated to the shared utils helpers)
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # identical inputs short-circuit to a perfect score; this check runs
        # before the emptiness check below
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # an empty input scores zero; returned as a float so every branch
        # yields the documented return type
        if utils.sim_check_for_empty(set1, set2):
            return 0.0

        # duplicates are irrelevant for Dice, so work on real sets
        if not isinstance(set1, set):
            set1 = set(set1)
        if not isinstance(set2, set):
            set2 = set(set2)

        shared = float(len(set1 & set2))
        total = float(len(set1) + len(set2))
        return 2.0 * shared / total

    def get_sim_score(self, set1, set2):
        """Computes the normalized Dice similarity score between two sets.

        The raw Dice score is already normalized, so this simply delegates
        to get_raw_score.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Normalized Dice similarity (float).

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> dice = Dice()
            >>> dice.get_sim_score(['data', 'science'], ['data'])
            0.6666666666666666
            >>> dice.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> dice.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
            0.5

        """
        return self.get_raw_score(set1, set2)
@@ -0,0 +1,84 @@
1
+ from __future__ import division
2
+
3
+ from py_stringmatching import utils
4
+ from py_stringmatching.similarity_measure.sequence_similarity_measure import \
5
+ SequenceSimilarityMeasure
6
+
7
+
8
class HammingDistance(SequenceSimilarityMeasure):
    """Hamming distance between two equal-length strings.

    The Hamming distance is the number of positions at which the two strings
    hold different symbols, i.e. the minimum number of substitutions needed to
    turn one string into the other.
    """

    def __init__(self):
        super(HammingDistance, self).__init__()

    def get_raw_score(self, string1, string2):
        """Counts the positions at which the two strings differ.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Hamming distance (int).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.
            ValueError : If the input strings are not of same length.

        Examples:
            >>> hd = HammingDistance()
            >>> hd.get_raw_score('', '')
            0
            >>> hd.get_raw_score('alex', 'john')
            4
            >>> hd.get_raw_score(' ', 'a')
            1
            >>> hd.get_raw_score('JOHN', 'john')
            4
        """
        # validate: non-None, string-typed, and of equal length
        utils.sim_check_for_none(string1, string2)
        utils.tok_check_for_string_input(string1, string2)
        utils.sim_check_for_same_len(string1, string2)

        # walk both strings in lockstep and count differing code points
        mismatches = 0
        for left, right in zip(string1, string2):
            if ord(left) - ord(right):
                mismatches += 1
        return mismatches

    def get_sim_score(self, string1, string2):
        """Converts the Hamming distance into a similarity in [0, 1].

        The score is one minus the fraction of mismatching positions; two
        empty strings are treated as identical.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Normalized Hamming similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.
            ValueError : If the input strings are not of same length.

        Examples:
            >>> hd = HammingDistance()
            >>> hd.get_sim_score('', '')
            1.0
            >>> hd.get_sim_score('alex', 'john')
            0.0
            >>> hd.get_sim_score(' ', 'a')
            0.0
            >>> hd.get_sim_score('JOHN', 'john')
            0.0
        """
        distance = self.get_raw_score(string1, string2)

        # get_raw_score has already enforced equal lengths
        length = len(string1)
        if length == 0:
            return 1.0

        # true division comes from the module-level __future__ import
        return 1 - (distance / length)
@@ -0,0 +1,7 @@
1
+ """Hybrid similarity measure"""
2
+
3
+ from py_stringmatching.similarity_measure.similarity_measure import \
4
+ SimilarityMeasure
5
+
6
class HybridSimilarityMeasure(SimilarityMeasure):
    """Marker base class for hybrid similarity measures.

    Adds nothing of its own on top of SimilarityMeasure.
    """
@@ -0,0 +1,79 @@
1
+ from py_stringmatching import utils
2
+ from py_stringmatching.similarity_measure.token_similarity_measure import \
3
+ TokenSimilarityMeasure
4
+
5
+
6
class Jaccard(TokenSimilarityMeasure):
    """Computes the Jaccard similarity measure between two token sets.

    For two sets X and Y, the Jaccard similarity score is:

    :math:`jaccard(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`
    """

    def __init__(self):
        super(Jaccard, self).__init__()

    def get_raw_score(self, set1, set2):
        """Computes the raw Jaccard score between two sets.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Jaccard similarity score (float).

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> jac = Jaccard()
            >>> jac.get_raw_score(['data', 'science'], ['data'])
            0.5
            >>> jac.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.375
            >>> jac.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.3333333333333333
        """
        # input validations (delegated to the shared utils helpers)
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # identical inputs short-circuit to a perfect score; this check runs
        # before the emptiness check below
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # an empty input scores zero; returned as a float so every branch
        # yields the documented return type
        if utils.sim_check_for_empty(set1, set2):
            return 0.0

        # duplicates are irrelevant for Jaccard, so work on real sets
        if not isinstance(set1, set):
            set1 = set(set1)
        if not isinstance(set2, set):
            set2 = set(set2)

        shared = float(len(set1 & set2))
        combined = float(len(set1 | set2))
        return shared / combined

    def get_sim_score(self, set1, set2):
        """Computes the normalized Jaccard similarity between two sets.

        The raw Jaccard score is already normalized, so this simply delegates
        to get_raw_score.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Normalized Jaccard similarity (float).

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> jac = Jaccard()
            >>> jac.get_sim_score(['data', 'science'], ['data'])
            0.5
            >>> jac.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.375
            >>> jac.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
            0.3333333333333333
        """
        return self.get_raw_score(set1, set2)
@@ -0,0 +1,110 @@
1
+ from py_stringmatching import utils
2
+ from six.moves import xrange
3
+ from py_stringmatching.similarity_measure.sequence_similarity_measure import \
4
+ SequenceSimilarityMeasure
5
+
6
+
7
class Jaro(SequenceSimilarityMeasure):
    """Computes Jaro measure.

    The Jaro measure is a type of edit distance, developed mainly to compare
    short strings, such as first and last names.
    """

    def __init__(self):
        super(Jaro, self).__init__()

    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jaro = Jaro()
            >>> jaro.get_raw_score('MARTHA', 'MARHTA')
            0.9444444444444445
            >>> jaro.get_raw_score('DWAYNE', 'DUANE')
            0.8222222222222223
            >>> jaro.get_raw_score('DIXON', 'DICKSONX')
            0.7666666666666666

        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.tok_check_for_string_input(string1, string2)

        # an empty input scores zero; returned as a float so every branch
        # yields the documented return type
        if utils.sim_check_for_empty(string1, string2):
            return 0.0

        len_s1 = len(string1)
        len_s2 = len(string2)

        # characters only count as matching when their positions are at most
        # search_range apart (half the longer length minus one, never negative)
        search_range = max((max(len_s1, len_s2) // 2) - 1, 0)

        # flags_sN[p] is True once position p of stringN has been matched
        flags_s1 = [False] * len_s1
        flags_s2 = [False] * len_s2

        # pass 1: pair each character of string1 with the first unmatched,
        # equal character of string2 inside the window
        common_chars = 0
        for i, ch_s1 in enumerate(string1):
            low = max(i - search_range, 0)
            high = min(i + search_range, len_s2 - 1)
            for j in xrange(low, high + 1):
                if not flags_s2[j] and string2[j] == ch_s1:
                    flags_s1[i] = flags_s2[j] = True
                    common_chars += 1
                    break

        if not common_chars:
            return 0.0

        # pass 2: walk the matched characters of both strings in order and
        # count the positions where they disagree
        k = trans_count = 0
        for i, f_s1 in enumerate(flags_s1):
            if f_s1:
                for j in xrange(k, len_s2):
                    if flags_s2[j]:
                        k = j + 1
                        break
                # j is deliberately read after the loop: it is the index of
                # the next matched character in string2
                if string1[i] != string2[j]:
                    trans_count += 1

        # NOTE(review): this module does not import division from __future__,
        # so this is floor division on Python 2 and true division on Python 3;
        # results can differ when trans_count is odd -- confirm which is intended.
        trans_count /= 2
        common_chars = float(common_chars)
        weight = (common_chars / len_s1 + common_chars / len_s2 +
                  (common_chars - trans_count) / common_chars) / 3
        return weight

    def get_sim_score(self, string1, string2):
        """Computes the normalized Jaro similarity score between two strings.

        The raw Jaro score is used as the normalized score, so this simply
        delegates to get_raw_score.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Normalized Jaro similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jaro = Jaro()
            >>> jaro.get_sim_score('MARTHA', 'MARHTA')
            0.9444444444444445
            >>> jaro.get_sim_score('DWAYNE', 'DUANE')
            0.8222222222222223
            >>> jaro.get_sim_score('DIXON', 'DICKSONX')
            0.7666666666666666

        """
        return self.get_raw_score(string1, string2)
@@ -0,0 +1,106 @@
1
+ from py_stringmatching import utils
2
+ from py_stringmatching.similarity_measure.jaro import Jaro
3
+ from py_stringmatching.similarity_measure.sequence_similarity_measure import \
4
+ SequenceSimilarityMeasure
5
+
6
+
7
class JaroWinkler(SequenceSimilarityMeasure):
    """Computes Jaro-Winkler measure.

    The Jaro-Winkler measure is designed to capture cases where two strings
    have a low Jaro score, but share a prefix and thus are likely to match.

    Args:
        prefix_weight (float): Weight to give to the prefix (defaults to 0.1).

    Attributes:
        prefix_weight (float): An attribute to store the prefix weight.
    """

    def __init__(self, prefix_weight=0.1):
        self.prefix_weight = prefix_weight
        super(JaroWinkler, self).__init__()

    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro-Winkler score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro-Winkler similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jw = JaroWinkler()
            >>> jw.get_raw_score('MARTHA', 'MARHTA')
            0.9611111111111111
            >>> jw.get_raw_score('DWAYNE', 'DUANE')
            0.84
            >>> jw.get_raw_score('DIXON', 'DICKSONX')
            0.8133333333333332

        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.tok_check_for_string_input(string1, string2)

        # an empty input scores zero; returned as a float so every branch
        # yields the documented return type
        if utils.sim_check_for_empty(string1, string2):
            return 0.0

        # start from the plain Jaro score; float() keeps the return type
        # uniform even if Jaro hands back an integer zero
        jw_score = float(Jaro().get_raw_score(string1, string2))

        # length of the common prefix, capped at 4 characters
        max_prefix = min(len(string1), len(string2), 4)
        prefix_len = 0
        while prefix_len < max_prefix and string1[prefix_len] == string2[prefix_len]:
            prefix_len += 1

        # boost the score in proportion to the shared prefix length
        if prefix_len:
            jw_score += prefix_len * self.prefix_weight * (1 - jw_score)

        return jw_score

    def get_sim_score(self, string1, string2):
        """Computes the normalized Jaro-Winkler similarity score between two strings.

        The raw Jaro-Winkler score is used as the normalized score, so this
        simply delegates to get_raw_score.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Normalized Jaro-Winkler similarity (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jw = JaroWinkler()
            >>> jw.get_sim_score('MARTHA', 'MARHTA')
            0.9611111111111111
            >>> jw.get_sim_score('DWAYNE', 'DUANE')
            0.84
            >>> jw.get_sim_score('DIXON', 'DICKSONX')
            0.8133333333333332
        """
        return self.get_raw_score(string1, string2)

    def get_prefix_weight(self):
        """Get prefix weight.

        Returns:
            prefix weight (float).
        """
        return self.prefix_weight

    def set_prefix_weight(self, prefix_weight):
        """Set prefix weight.

        Args:
            prefix_weight (float): Weight to give to the prefix.

        Returns:
            True, always (bool).
        """
        self.prefix_weight = prefix_weight
        return True
@@ -0,0 +1,75 @@
1
+ from __future__ import division
2
+
3
+ from py_stringmatching import utils
4
+ from py_stringmatching.similarity_measure.cython_levenshtein import levenshtein
5
+ from py_stringmatching.similarity_measure.sequence_similarity_measure import \
6
+ SequenceSimilarityMeasure
7
+
8
+
9
class Levenshtein(SequenceSimilarityMeasure):
    """Computes Levenshtein measure (also known as edit distance).

    Levenshtein distance computes the minimum cost of transforming one string
    into the other. Transforming a string is carried out using a sequence of
    the following operators: delete a character, insert a character, and
    substitute one character for another.
    """

    def __init__(self):
        super(Levenshtein, self).__init__()

    def get_raw_score(self, string1, string2):
        """Computes the raw Levenshtein distance between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Levenshtein distance (int).

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_raw_score('a', '')
            1
            >>> lev.get_raw_score('example', 'samples')
            3
            >>> lev.get_raw_score('levenshtein', 'frankenstein')
            6
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # identical strings need no edits; return an integer zero so this
        # shortcut agrees with the documented int return type
        if utils.sim_check_for_exact_match(string1, string2):
            return 0

        # the distance itself is computed by the imported levenshtein function
        return levenshtein(string1, string2)

    def get_sim_score(self, string1, string2):
        """Computes the normalized Levenshtein similarity score between two strings.

        The score is one minus the distance divided by the length of the
        longer string; two empty strings are treated as identical.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Normalized Levenshtein similarity (float).

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_sim_score('a', '')
            0.0
            >>> lev.get_sim_score('example', 'samples')
            0.5714285714285714
            >>> lev.get_sim_score('levenshtein', 'frankenstein')
            0.5

        """
        raw_score = self.get_raw_score(string1, string2)
        max_len = max(len(string1), len(string2))
        if max_len == 0:
            return 1.0
        # true division comes from the module-level __future__ import, so the
        # result is a float even when raw_score is an int
        return 1 - (raw_score / max_len)
@@ -0,0 +1,100 @@
1
+ from py_stringmatching import utils
2
+ from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
3
+ from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
4
+ HybridSimilarityMeasure
5
+
6
+
7
class MongeElkan(HybridSimilarityMeasure):
    """Computes Monge-Elkan measure.

    The Monge-Elkan similarity measure is a type of hybrid similarity measure
    that combines the benefits of sequence-based and set-based methods. This
    can be effective for domains in which more control is needed over the
    similarity measure. It uses a secondary similarity measure, such as
    Levenshtein, to compute the overall similarity score. See the string
    matching chapter in the DI book (Principles of Data Integration).

    Args:
        sim_func (function): Secondary similarity function. This is expected to be a sequence-based
                             similarity measure (defaults to Jaro-Winkler similarity measure).

    Attributes:
        sim_func (function): An attribute to store the secondary similarity function.
    """

    def __init__(self, sim_func=JaroWinkler().get_raw_score):
        self.sim_func = sim_func
        super(MongeElkan, self).__init__()

    def get_raw_score(self, bag1, bag2):
        """Computes the raw Monge-Elkan score between two bags (lists).

        For every element of bag1 the best score against any element of bag2
        is taken, and these best scores are averaged over bag1.

        Args:
            bag1,bag2 (list): Input lists.

        Returns:
            Monge-Elkan similarity score (float).

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None.

        Examples:
            >>> me = MongeElkan()
            >>> me.get_raw_score(['Niall'], ['Neal'])
            0.8049999999999999
            >>> me.get_raw_score(['Niall'], ['Nigel'])
            0.7866666666666667
            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            0.8677218614718616
            >>> me.get_raw_score([''], ['a'])
            0.0
            >>> me = MongeElkan(sim_func=NeedlemanWunsch().get_raw_score)
            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            2.0
            >>> me = MongeElkan(sim_func=Affine().get_raw_score)
            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            2.25

        References:
            * Principles of Data Integration book
        """
        # input validations
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # identical bags short-circuit to 1.0 without calling sim_func
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0

        # an empty bag scores zero; returned as a float so every branch
        # yields the documented return type
        if utils.sim_check_for_empty(bag1, bag2):
            return 0.0

        # sum, over the elements of bag1, of the best score each one reaches
        # against any element of bag2
        sum_of_maxes = 0
        for el1 in bag1:
            max_sim = float('-inf')
            for el2 in bag2:
                max_sim = max(max_sim, self.sim_func(el1, el2))
            sum_of_maxes += max_sim

        # average the best scores over the size of bag1
        return float(sum_of_maxes) / float(len(bag1))

    def get_sim_func(self):
        """Get the secondary similarity function.

        Returns:
            secondary similarity function (function).
        """
        return self.sim_func

    def set_sim_func(self, sim_func):
        """Set the secondary similarity function.

        Args:
            sim_func (function): Secondary similarity function.

        Returns:
            True, always (bool).
        """
        self.sim_func = sim_func
        return True