py-stringmatching 0.1.0__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_stringmatching-0.1.0/AUTHORS.rst +6 -0
- py_stringmatching-0.1.0/CHANGES.txt +6 -0
- py_stringmatching-0.1.0/LICENSE +27 -0
- py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
- py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
- py_stringmatching-0.1.0/MANIFEST.in +6 -0
- py_stringmatching-0.1.0/PKG-INFO +57 -0
- py_stringmatching-0.1.0/README.rst +27 -0
- py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
- py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
- py_stringmatching-0.1.0/requirements.txt +2 -0
- py_stringmatching-0.1.0/setup.cfg +5 -0
- py_stringmatching-0.1.0/setup.py +107 -0

py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py (new file)

@@ -0,0 +1,121 @@
+import numpy as np
+
+from py_stringmatching import utils
+from six.moves import xrange
+from py_stringmatching.similarity_measure.sequence_similarity_measure import \
+    SequenceSimilarityMeasure
+
+
+def sim_ident(char1, char2):
+    return int(char1 == char2)
+
+
+class NeedlemanWunsch(SequenceSimilarityMeasure):
+    """Computes Needleman-Wunsch measure.
+
+    The Needleman-Wunsch distance generalizes the Levenshtein distance and considers global alignment between two strings.
+    Specifically, it is computed by assigning a score to each alignment between the two input strings and choosing the
+    score of the best alignment, that is, the maximal score. An alignment between two strings is a set of correspondences
+    between their characters, allowing for gaps.
+
+    Args:
+        gap_cost (float): Cost of gap (defaults to 1.0).
+        sim_func (function): Similarity function to give a score for each correspondence between the characters (defaults
+                             to an identity function, which returns 1 if the two characters are the same and 0 otherwise).
+
+    Attributes:
+        gap_cost (float): An attribute to store the gap cost.
+        sim_func (function): An attribute to store the similarity function.
+    """
+
+    def __init__(self, gap_cost=1.0, sim_func=sim_ident):
+        self.gap_cost = gap_cost
+        self.sim_func = sim_func
+        super(NeedlemanWunsch, self).__init__()
+
+    def get_raw_score(self, string1, string2):
+        """Computes the raw Needleman-Wunsch score between two strings.
+
+        Args:
+            string1,string2 (str) : Input strings.
+
+        Returns:
+            Needleman-Wunsch similarity score (float).
+
+        Raises:
+            TypeError : If the inputs are not strings or if one of the inputs is None.
+
+        Examples:
+            >>> nw = NeedlemanWunsch()
+            >>> nw.get_raw_score('dva', 'deeva')
+            1.0
+            >>> nw = NeedlemanWunsch(gap_cost=0.0)
+            >>> nw.get_raw_score('dva', 'deeve')
+            2.0
+            >>> nw = NeedlemanWunsch(gap_cost=1.0, sim_func=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
+            >>> nw.get_raw_score('dva', 'deeve')
+            1.0
+            >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
+            >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
+            2.5
+        """
+
+        # input validations
+        utils.sim_check_for_none(string1, string2)
+        utils.sim_check_for_string_inputs(string1, string2)
+
+        dist_mat = np.zeros((len(string1) + 1, len(string2) + 1),
+                            dtype=np.float)
+
+        # DP initialization
+        for i in xrange(len(string1) + 1):
+            dist_mat[i, 0] = -(i * self.gap_cost)
+
+        # DP initialization
+        for j in xrange(len(string2) + 1):
+            dist_mat[0, j] = -(j * self.gap_cost)
+
+        # Needleman-Wunsch DP calculation
+        for i in xrange(1, len(string1) + 1):
+            for j in xrange(1, len(string2) + 1):
+                match = dist_mat[i - 1, j - 1] + self.sim_func(string1[i - 1],
+                                                               string2[j - 1])
+                delete = dist_mat[i - 1, j] - self.gap_cost
+                insert = dist_mat[i, j - 1] - self.gap_cost
+                dist_mat[i, j] = max(match, delete, insert)
+
+        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]
+
+    def get_gap_cost(self):
+        """Get gap cost.
+
+        Returns:
+            Gap cost (float).
+        """
+        return self.gap_cost
+
+    def get_sim_func(self):
+        """Get the similarity function.
+
+        Returns:
+            similarity function (function).
+        """
+        return self.sim_func
+
+    def set_gap_cost(self, gap_cost):
+        """Set gap cost.
+
+        Args:
+            gap_cost (float): Cost of gap.
+        """
+        self.gap_cost = gap_cost
+        return True
+
+    def set_sim_func(self, sim_func):
+        """Set similarity function.
+
+        Args:
+            sim_func (function): Similarity function to give a score for the correspondence between characters.
+        """
+        self.sim_func = sim_func
+        return True
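
Usage sketch for the NeedlemanWunsch class added above (illustration only, not part of the package contents; assumes py_stringmatching 0.1.0 and its numpy and six dependencies are installed). The expected values come from the doctests in the file.

    from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch

    # Default scoring: identity similarity (1 for a matching character pair,
    # 0 otherwise) with a gap cost of 1.0 per inserted or deleted character.
    nw = NeedlemanWunsch()
    print(nw.get_raw_score('dva', 'deeva'))    # 1.0

    # Custom scoring: +2 for a match, -1 for a mismatch, gap cost 1.0.
    nw_custom = NeedlemanWunsch(gap_cost=1.0,
                                sim_func=lambda c1, c2: 2.0 if c1 == c2 else -1.0)
    print(nw_custom.get_raw_score('dva', 'deeve'))    # 1.0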

py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py (new file)

@@ -0,0 +1,86 @@
+from py_stringmatching import utils
+from py_stringmatching.similarity_measure.token_similarity_measure import \
+    TokenSimilarityMeasure
+
+
+class OverlapCoefficient(TokenSimilarityMeasure):
+    """Computes overlap coefficient measure.
+
+    The overlap coefficient is a similarity measure related to the Jaccard
+    measure that measures the overlap between two sets, and is defined as the size of the intersection divided by
+    the smaller of the size of the two sets. For two sets X and Y, the overlap coefficient is:
+
+    :math:`overlap\\_coefficient(X, Y) = \\frac{|X \\cap Y|}{\\min(|X|, |Y|)}`
+    """
+
+    def __init__(self):
+        super(OverlapCoefficient, self).__init__()
+
+    def get_raw_score(self, set1, set2):
+        """Computes the raw overlap coefficient score between two sets.
+
+        Args:
+            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
+
+        Returns:
+            Overlap coefficient (float).
+
+        Raises:
+            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
+
+        Examples:
+            >>> oc = OverlapCoefficient()
+            >>> oc.get_raw_score(['data', 'science'], ['data'])
+            1.0
+            >>> oc.get_raw_score([], [])
+            1.0
+            >>> oc.get_raw_score([], ['data'])
+            0
+
+        References:
+            * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient
+            * SimMetrics library
+        """
+
+        # input validations
+        utils.sim_check_for_none(set1, set2)
+        utils.sim_check_for_list_or_set_inputs(set1, set2)
+
+        # if exact match return 1.0
+        if utils.sim_check_for_exact_match(set1, set2):
+            return 1.0
+
+        # if one of the strings is empty return 0
+        if utils.sim_check_for_empty(set1, set2):
+            return 0
+
+        if not isinstance(set1, set):
+            set1 = set(set1)
+        if not isinstance(set2, set):
+            set2 = set(set2)
+
+        return float(len(set1 & set2)) / min(len(set1), len(set2))
+
+    def get_sim_score(self, set1, set2):
+        """Computes the normalized overlap coefficient between two sets. Simply call get_raw_score.
+
+        Args:
+            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
+
+        Returns:
+            Normalized overlap coefficient (float).
+
+        Raises:
+            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
+
+        Examples:
+            >>> oc = OverlapCoefficient()
+            >>> oc.get_sim_score(['data', 'science'], ['data'])
+            1.0
+            >>> oc.get_sim_score([], [])
+            1.0
+            >>> oc.get_sim_score([], ['data'])
+            0
+
+        """
+        return self.get_raw_score(set1, set2)
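
Usage sketch for the OverlapCoefficient class added above (illustration only, not part of the package contents). It reproduces the doctest value and compares it with a direct evaluation of the formula |X ∩ Y| / min(|X|, |Y|).

    from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient

    x = ['data', 'science']
    y = ['data']

    oc = OverlapCoefficient()
    score = oc.get_raw_score(x, y)    # 1.0, as in the doctest

    # Direct computation of the formula after converting the lists to sets.
    manual = float(len(set(x) & set(y))) / min(len(set(x)), len(set(y)))
    assert score == manual == 1.0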

py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py (new file)

@@ -0,0 +1,115 @@
+# coding=utf-8
+
+import numpy as np
+
+from py_stringmatching import utils
+from six.moves import xrange
+from py_stringmatching.similarity_measure.sequence_similarity_measure import \
+    SequenceSimilarityMeasure
+
+
+def sim_ident(char1, char2):
+    return int(char1 == char2)
+
+
+class SmithWaterman(SequenceSimilarityMeasure):
+    """Computes Smith-Waterman measure.
+
+    The Smith-Waterman algorithm performs local sequence alignment; that is, for determining similar regions
+    between two strings. Instead of looking at the total sequence, the Smith–Waterman algorithm compares segments of
+    all possible lengths and optimizes the similarity measure. See the string matching chapter in the DI book (Principles of Data Integration).
+
+    Args:
+        gap_cost (float): Cost of gap (defaults to 1.0).
+        sim_func (function): Similarity function to give a score for the correspondence between the characters (defaults
+                             to an identity function, which returns 1 if the two characters are the same and 0 otherwise).
+
+    Attributes:
+        gap_cost (float): An attribute to store the gap cost.
+        sim_func (function): An attribute to store the similarity function.
+    """
+
+    def __init__(self, gap_cost=1.0, sim_func=sim_ident):
+        self.gap_cost = gap_cost
+        self.sim_func = sim_func
+        super(SmithWaterman, self).__init__()
+
+    def get_raw_score(self, string1, string2):
+        """Computes the raw Smith-Waterman score between two strings.
+
+        Args:
+            string1,string2 (str) : Input strings.
+
+        Returns:
+            Smith-Waterman similarity score (float).
+
+        Raises:
+            TypeError : If the inputs are not strings or if one of the inputs is None.
+
+        Examples:
+            >>> sw = SmithWaterman()
+            >>> sw.get_raw_score('cat', 'hat')
+            2.0
+            >>> sw = SmithWaterman(gap_cost=2.2)
+            >>> sw.get_raw_score('dva', 'deeve')
+            1.0
+            >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
+            >>> sw.get_raw_score('dva', 'deeve')
+            2.0
+            >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
+            >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
+            6.5
+        """
+
+        # input validations
+        utils.sim_check_for_none(string1, string2)
+        utils.sim_check_for_string_inputs(string1, string2)
+
+        dist_mat = np.zeros((len(string1) + 1, len(string2) + 1),
+                            dtype=np.float)
+        max_value = 0
+        # Smith Waterman DP calculations
+        for i in xrange(1, len(string1) + 1):
+            for j in xrange(1, len(string2) + 1):
+                match = dist_mat[i - 1, j - 1] + self.sim_func(string1[i - 1],
+                                                               string2[j - 1])
+                delete = dist_mat[i - 1, j] - self.gap_cost
+                insert = dist_mat[i, j - 1] - self.gap_cost
+                dist_mat[i, j] = max(0, match, delete, insert)
+                max_value = max(max_value, dist_mat[i, j])
+
+        return max_value
+
+    def get_gap_cost(self):
+        """Get gap cost.
+
+        Returns:
+            Gap cost (float).
+        """
+        return self.gap_cost
+
+    def get_sim_func(self):
+        """Get similarity function.
+
+        Returns:
+            Similarity function (function).
+        """
+        return self.sim_func
+
+    def set_gap_cost(self, gap_cost):
+        """Set gap cost.
+
+        Args:
+            gap_cost (float): Cost of gap.
+        """
+        self.gap_cost = gap_cost
+        return True
+
+    def set_sim_func(self, sim_func):
+        """Set similarity function.
+
+        Args:
+            sim_func (function): Similarity function to give a score for the correspondence between the characters.
+        """
+        self.sim_func = sim_func
+        return True
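
Usage sketch for the SmithWaterman class added above (illustration only, not part of the package contents). The expected values come from the doctests in the file.

    from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman

    # Identity scoring (1 per matching character) and gap cost 1.0: the best
    # local alignment of 'cat' and 'hat' is the shared substring 'at'.
    sw = SmithWaterman()
    print(sw.get_raw_score('cat', 'hat'))    # 2.0

    # A higher gap cost penalizes alignments that span insertions or deletions.
    sw_costly_gaps = SmithWaterman(gap_cost=2.2)
    print(sw_costly_gaps.get_raw_score('dva', 'deeve'))    # 1.0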

py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py (new file)

@@ -0,0 +1,198 @@
+from __future__ import division
+from math import sqrt
+import collections
+
+from py_stringmatching import utils
+from py_stringmatching.similarity_measure.jaro import Jaro
+from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
+    HybridSimilarityMeasure
+
+
+class SoftTfIdf(HybridSimilarityMeasure):
+    """Computes soft TF/IDF measure.
+
+    Args:
+        corpus_list (list of lists): Corpus list (default is set to None) of strings. If set to None,
+                                     the input lists are considered the only corpus.
+        sim_func (function): Secondary similarity function. This should return a similarity score between two strings (optional),
+                             default is the Jaro similarity measure.
+        threshold (float): Threshold value for the secondary similarity function (defaults to 0.5). If the similarity
+                           of a token pair exceeds the threshold, then the token pair is considered a match.
+
+    Attributes:
+        sim_func (function): An attribute to store the secondary similarity function.
+        threshold (float): An attribute to store the threshold value for the secondary similarity function.
+
+    Note:
+        Currently, this measure is implemented without dampening. This is similar to setting dampen flag to be False in TF-IDF.
+        We plan to add the dampen flag in the next release.
+    """
+
+    def __init__(self, corpus_list=None, sim_func=Jaro().get_raw_score,
+                 threshold=0.5):
+        self.__corpus_list = corpus_list
+        self.__document_frequency = {}
+        self.__compute_document_frequency()
+        self.__corpus_size = 0 if self.__corpus_list is None else (
+            len(self.__corpus_list))
+        self.sim_func = sim_func
+        self.threshold = threshold
+        super(SoftTfIdf, self).__init__()
+
+    def get_raw_score(self, bag1, bag2):
+        """Computes the raw soft TF/IDF score between two lists given the corpus information.
+
+        Args:
+            bag1,bag2 (list): Input lists
+
+        Returns:
+            Soft TF/IDF score between the input lists (float).
+
+        Raises:
+            TypeError : If the inputs are not lists or if one of the inputs is None.
+
+        Examples:
+            >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=Jaro().get_raw_score, threshold=0.8)
+            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
+            0.17541160386140586
+            >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
+            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
+            0.5547001962252291
+            >>> soft_tfidf = SoftTfIdf([['x', 'y'], ['w'], ['q']])
+            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
+            0.0
+            >>> soft_tfidf = SoftTfIdf(sim_func=Affine().get_raw_score, threshold=0.6)
+            >>> soft_tfidf.get_raw_score(['aa', 'bb', 'a'], ['ab', 'ba'])
+            0.81649658092772592
+
+        References:
+            * the string matching chapter of the "Principles of Data Integration" book.
+        """
+
+        # input validations
+        utils.sim_check_for_none(bag1, bag2)
+        utils.sim_check_for_list_or_set_inputs(bag1, bag2)
+
+        # if the strings match exactly return 1.0
+        if utils.sim_check_for_exact_match(bag1, bag2):
+            return 1.0
+
+        # if one of the strings is empty return 0
+        if utils.sim_check_for_empty(bag1, bag2):
+            return 0
+
+        # term frequency for input strings
+        tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
+
+        # find unique elements in the input lists and their document frequency
+        local_df = {}
+        for element in tf_x:
+            local_df[element] = local_df.get(element, 0) + 1
+        for element in tf_y:
+            local_df[element] = local_df.get(element, 0) + 1
+
+        # if corpus is not provided treat input string as corpus
+        curr_df, corpus_size = (local_df, 2) if self.__corpus_list is None else (
+            (self.__document_frequency, self.__corpus_size))
+
+        # calculating the term sim score against the input string 2,
+        # construct similarity map
+        similarity_map = {}
+        for term_x in tf_x:
+            max_score = 0.0
+            for term_y in tf_y:
+                score = self.sim_func(term_x, term_y)
+                # adding sim only if it is above threshold and
+                # highest for this element
+                if score > self.threshold and score > max_score:
+                    similarity_map[term_x] = (term_x, term_y, score)
+                    max_score = score
+
+        # position of first string, second string and sim score
+        # in the tuple
+        first_string_pos = 0
+        second_string_pos = 1
+        sim_score_pos = 2
+
+        result, v_x_2, v_y_2 = 0.0, 0.0, 0.0
+        # soft-tfidf calculation
+        for element in local_df.keys():
+            if curr_df.get(element) is None:
+                continue
+            # numerator
+            if element in similarity_map:
+                sim = similarity_map[element]
+                idf_first = corpus_size / curr_df.get(sim[first_string_pos], 1)
+                idf_second = corpus_size / curr_df.get(sim[second_string_pos], 1)
+                v_x = idf_first * tf_x.get(sim[first_string_pos], 0)
+                v_y = idf_second * tf_y.get(sim[second_string_pos], 0)
+                result += v_x * v_y * sim[sim_score_pos]
+            # denominator
+            idf = corpus_size / curr_df[element]
+            v_x = idf * tf_x.get(element, 0)
+            v_x_2 += v_x * v_x
+            v_y = idf * tf_y.get(element, 0)
+            v_y_2 += v_y * v_y
+        return result if v_x_2 == 0 else result / (sqrt(v_x_2) * sqrt(v_y_2))
+
+    def get_corpus_list(self):
+        """Get corpus list.
+
+        Returns:
+            corpus list (list of lists).
+        """
+        return self.__corpus_list
+
+    def get_sim_func(self):
+        """Get secondary similarity function.
+
+        Returns:
+            secondary similarity function (function).
+        """
+        return self.sim_func
+
+    def get_threshold(self):
+        """Get threshold used for the secondary similarity function.
+
+        Returns:
+            threshold (float).
+        """
+        return self.threshold
+
+    def set_threshold(self, threshold):
+        """Set threshold value for the secondary similarity function.
+
+        Args:
+            threshold (float): threshold value.
+        """
+        self.threshold = threshold
+        return True
+
+    def set_sim_func(self, sim_func):
+        """Set secondary similarity function.
+
+        Args:
+            sim_func (function): Secondary similarity function.
+        """
+        self.sim_func = sim_func
+        return True
+
+    def set_corpus_list(self, corpus_list):
+        """Set corpus list.
+
+        Args:
+            corpus_list (list of lists): Corpus list.
+        """
+        self.__corpus_list = corpus_list
+        self.__document_frequency = {}
+        self.__compute_document_frequency()
+        self.__corpus_size = 0 if self.__corpus_list is None else (
+            len(self.__corpus_list))
+        return True
+
+    def __compute_document_frequency(self):
+        if self.__corpus_list != None:
+            for document in self.__corpus_list:
+                for element in set(document):
+                    self.__document_frequency[element] = (
+                        self.__document_frequency.get(element, 0) + 1)
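
Usage sketch for the SoftTfIdf class added above (illustration only, not part of the package contents). The corpus list supplies document frequencies; token pairs whose secondary (Jaro) similarity exceeds the threshold contribute to the TF/IDF-weighted score. The expected value comes from the doctest in the file.

    from py_stringmatching.similarity_measure.jaro import Jaro
    from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf

    corpus = [['a', 'b', 'a'], ['a', 'c'], ['a']]

    # Corpus-backed document frequencies, Jaro as the secondary measure.
    soft_tfidf = SoftTfIdf(corpus, sim_func=Jaro().get_raw_score, threshold=0.8)
    print(soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c']))
    # 0.17541160386140586

    # With corpus_list left as None, the two input lists themselves
    # are treated as the corpus.
    soft_tfidf_no_corpus = SoftTfIdf(threshold=0.9)
    soft_tfidf_no_corpus.get_raw_score(['a', 'b', 'a'], ['a'])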