py-stringmatching 0.1.0 (zip)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. py_stringmatching-0.1.0/AUTHORS.rst +6 -0
  2. py_stringmatching-0.1.0/CHANGES.txt +6 -0
  3. py_stringmatching-0.1.0/LICENSE +27 -0
  4. py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
  5. py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
  6. py_stringmatching-0.1.0/MANIFEST.in +6 -0
  7. py_stringmatching-0.1.0/PKG-INFO +57 -0
  8. py_stringmatching-0.1.0/README.rst +27 -0
  9. py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
  10. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
  11. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
  12. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
  13. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
  14. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
  15. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
  16. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
  17. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
  18. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
  19. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
  20. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
  21. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
  22. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
  23. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
  24. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
  25. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
  26. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
  27. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
  28. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
  29. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
  30. py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
  31. py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
  32. py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
  33. py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
  34. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
  35. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
  36. py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
  37. py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
  38. py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
  39. py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
  40. py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
  41. py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
  42. py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
  43. py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
  44. py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
  45. py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
  46. py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
  47. py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
  48. py_stringmatching-0.1.0/requirements.txt +2 -0
  49. py_stringmatching-0.1.0/setup.cfg +5 -0
  50. py_stringmatching-0.1.0/setup.py +107 -0
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py
@@ -0,0 +1,121 @@
+ import numpy as np
+
+ from py_stringmatching import utils
+ from six.moves import xrange
+ from py_stringmatching.similarity_measure.sequence_similarity_measure import \
+     SequenceSimilarityMeasure
+
+
+ def sim_ident(char1, char2):
+     return int(char1 == char2)
+
+
+ class NeedlemanWunsch(SequenceSimilarityMeasure):
+     """Computes Needleman-Wunsch measure.
+
+     The Needleman-Wunsch distance generalizes the Levenshtein distance and considers global alignment between two strings.
+     Specifically, it is computed by assigning a score to each alignment between the two input strings and choosing the
+     score of the best alignment, that is, the maximal score. An alignment between two strings is a set of correspondences
+     between their characters, allowing for gaps.
+
+     Args:
+         gap_cost (float): Cost of gap (defaults to 1.0).
+         sim_func (function): Similarity function to give a score for each correspondence between the characters (defaults
+                              to an identity function, which returns 1 if the two characters are the same and 0 otherwise).
+
+     Attributes:
+         gap_cost (float): An attribute to store the gap cost.
+         sim_func (function): An attribute to store the similarity function.
+     """
+
+     def __init__(self, gap_cost=1.0, sim_func=sim_ident):
+         self.gap_cost = gap_cost
+         self.sim_func = sim_func
+         super(NeedlemanWunsch, self).__init__()
+
+     def get_raw_score(self, string1, string2):
+         """Computes the raw Needleman-Wunsch score between two strings.
+
+         Args:
+             string1,string2 (str) : Input strings.
+
+         Returns:
+             Needleman-Wunsch similarity score (float).
+
+         Raises:
+             TypeError : If the inputs are not strings or if one of the inputs is None.
+
+         Examples:
+             >>> nw = NeedlemanWunsch()
+             >>> nw.get_raw_score('dva', 'deeva')
+             1.0
+             >>> nw = NeedlemanWunsch(gap_cost=0.0)
+             >>> nw.get_raw_score('dva', 'deeve')
+             2.0
+             >>> nw = NeedlemanWunsch(gap_cost=1.0, sim_func=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
+             >>> nw.get_raw_score('dva', 'deeve')
+             1.0
+             >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
+             >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
+             2.5
+         """
+
+         # input validations
+         utils.sim_check_for_none(string1, string2)
+         utils.sim_check_for_string_inputs(string1, string2)
+
+         dist_mat = np.zeros((len(string1) + 1, len(string2) + 1),
+                             dtype=np.float)
+
+         # DP initialization
+         for i in xrange(len(string1) + 1):
+             dist_mat[i, 0] = -(i * self.gap_cost)
+
+         # DP initialization
+         for j in xrange(len(string2) + 1):
+             dist_mat[0, j] = -(j * self.gap_cost)
+
+         # Needleman-Wunsch DP calculation
+         for i in xrange(1, len(string1) + 1):
+             for j in xrange(1, len(string2) + 1):
+                 match = dist_mat[i - 1, j - 1] + self.sim_func(string1[i - 1],
+                                                                string2[j - 1])
+                 delete = dist_mat[i - 1, j] - self.gap_cost
+                 insert = dist_mat[i, j - 1] - self.gap_cost
+                 dist_mat[i, j] = max(match, delete, insert)
+
+         return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]
+
+     def get_gap_cost(self):
+         """Get gap cost.
+
+         Returns:
+             Gap cost (float).
+         """
+         return self.gap_cost
+
+     def get_sim_func(self):
+         """Get the similarity function.
+
+         Returns:
+             similarity function (function).
+         """
+         return self.sim_func
+
+     def set_gap_cost(self, gap_cost):
+         """Set gap cost.
+
+         Args:
+             gap_cost (float): Cost of gap.
+         """
+         self.gap_cost = gap_cost
+         return True
+
+     def set_sim_func(self, sim_func):
+         """Set similarity function.
+
+         Args:
+             sim_func (function): Similarity function to give a score for the correspondence between characters.
+         """
+         self.sim_func = sim_func
+         return True
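As a quick orientation for reviewers, here is a minimal usage sketch of the NeedlemanWunsch class added above. It is only a sketch: the import path is taken from the file list in this diff, and the expected outputs are the ones shown in the class's own docstring examples (or follow directly from the default scoring, as noted in the comments).

    >>> from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch
    >>> nw = NeedlemanWunsch()              # default: +1 per matching character, gap cost 1.0
    >>> nw.get_raw_score('dva', 'deeva')    # aligns d, v, a (+3) and pays for two gaps (-2)
    1.0
    >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda c1, c2: 1.0 if c1 == c2 else -1.0)
    >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
    2.5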
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py
@@ -0,0 +1,86 @@
+ from py_stringmatching import utils
+ from py_stringmatching.similarity_measure.token_similarity_measure import \
+     TokenSimilarityMeasure
+
+
+ class OverlapCoefficient(TokenSimilarityMeasure):
+     """Computes overlap coefficient measure.
+
+     The overlap coefficient is a similarity measure related to the Jaccard
+     measure that measures the overlap between two sets, and is defined as the size of the intersection divided by
+     the smaller of the sizes of the two sets. For two sets X and Y, the overlap coefficient is:
+
+     :math:`overlap\\_coefficient(X, Y) = \\frac{|X \\cap Y|}{\\min(|X|, |Y|)}`
+     """
+
+     def __init__(self):
+         super(OverlapCoefficient, self).__init__()
+
+     def get_raw_score(self, set1, set2):
+         """Computes the raw overlap coefficient score between two sets.
+
+         Args:
+             set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
+
+         Returns:
+             Overlap coefficient (float).
+
+         Raises:
+             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
+
+         Examples:
+             >>> oc = OverlapCoefficient()
+             >>> oc.get_raw_score(['data', 'science'], ['data'])
+             1.0
+             >>> oc.get_raw_score([], [])
+             1.0
+             >>> oc.get_raw_score([], ['data'])
+             0
+
+         References:
+             * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient
+             * SimMetrics library
+         """
+
+         # input validations
+         utils.sim_check_for_none(set1, set2)
+         utils.sim_check_for_list_or_set_inputs(set1, set2)
+
+         # if exact match return 1.0
+         if utils.sim_check_for_exact_match(set1, set2):
+             return 1.0
+
+         # if one of the strings is empty return 0
+         if utils.sim_check_for_empty(set1, set2):
+             return 0
+
+         if not isinstance(set1, set):
+             set1 = set(set1)
+         if not isinstance(set2, set):
+             set2 = set(set2)
+
+         return float(len(set1 & set2)) / min(len(set1), len(set2))
+
+     def get_sim_score(self, set1, set2):
+         """Computes the normalized overlap coefficient between two sets. Simply calls get_raw_score.
+
+         Args:
+             set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
+
+         Returns:
+             Normalized overlap coefficient (float).
+
+         Raises:
+             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
+
+         Examples:
+             >>> oc = OverlapCoefficient()
+             >>> oc.get_sim_score(['data', 'science'], ['data'])
+             1.0
+             >>> oc.get_sim_score([], [])
+             1.0
+             >>> oc.get_sim_score([], ['data'])
+             0
+
+         """
+         return self.get_raw_score(set1, set2)
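A minimal usage sketch of the OverlapCoefficient class added above. Again, only a sketch: the import path comes from this diff's file list, the first output is the docstring example, and the second follows directly from the formula in the docstring.

    >>> from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient
    >>> oc = OverlapCoefficient()
    >>> oc.get_raw_score(['data', 'science'], ['data'])   # |{data}| / min(2, 1)
    1.0
    >>> oc.get_raw_score(['a', 'b'], ['b', 'c', 'd'])     # |{b}| / min(2, 3)
    0.5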
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py
@@ -0,0 +1,7 @@
+ """Sequence based similarity measure"""
+
+ from py_stringmatching.similarity_measure.similarity_measure import \
+     SimilarityMeasure
+
+ class SequenceSimilarityMeasure(SimilarityMeasure):
+     pass
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py
@@ -0,0 +1,4 @@
+ """Similarity measure"""
+
+ class SimilarityMeasure(object):
+     pass
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py
@@ -0,0 +1,115 @@
+ # coding=utf-8
+
+ import numpy as np
+
+ from py_stringmatching import utils
+ from six.moves import xrange
+ from py_stringmatching.similarity_measure.sequence_similarity_measure import \
+     SequenceSimilarityMeasure
+
+
+ def sim_ident(char1, char2):
+     return int(char1 == char2)
+
+
+ class SmithWaterman(SequenceSimilarityMeasure):
+     """Computes Smith-Waterman measure.
+
+     The Smith-Waterman algorithm performs local sequence alignment; that is, it determines similar regions
+     between two strings. Instead of looking at the total sequence, the Smith-Waterman algorithm compares segments of
+     all possible lengths and optimizes the similarity measure. See the string matching chapter in the DI book (Principles of Data Integration).
+
+     Args:
+         gap_cost (float): Cost of gap (defaults to 1.0).
+         sim_func (function): Similarity function to give a score for the correspondence between the characters (defaults
+                              to an identity function, which returns 1 if the two characters are the same and 0 otherwise).
+
+     Attributes:
+         gap_cost (float): An attribute to store the gap cost.
+         sim_func (function): An attribute to store the similarity function.
+     """
+
+     def __init__(self, gap_cost=1.0, sim_func=sim_ident):
+         self.gap_cost = gap_cost
+         self.sim_func = sim_func
+         super(SmithWaterman, self).__init__()
+
+     def get_raw_score(self, string1, string2):
+         """Computes the raw Smith-Waterman score between two strings.
+
+         Args:
+             string1,string2 (str) : Input strings.
+
+         Returns:
+             Smith-Waterman similarity score (float).
+
+         Raises:
+             TypeError : If the inputs are not strings or if one of the inputs is None.
+
+         Examples:
+             >>> sw = SmithWaterman()
+             >>> sw.get_raw_score('cat', 'hat')
+             2.0
+             >>> sw = SmithWaterman(gap_cost=2.2)
+             >>> sw.get_raw_score('dva', 'deeve')
+             1.0
+             >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
+             >>> sw.get_raw_score('dva', 'deeve')
+             2.0
+             >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
+             >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
+             6.5
+         """
+
+         # input validations
+         utils.sim_check_for_none(string1, string2)
+         utils.sim_check_for_string_inputs(string1, string2)
+
+         dist_mat = np.zeros((len(string1) + 1, len(string2) + 1),
+                             dtype=np.float)
+         max_value = 0
+         # Smith Waterman DP calculations
+         for i in xrange(1, len(string1) + 1):
+             for j in xrange(1, len(string2) + 1):
+                 match = dist_mat[i - 1, j - 1] + self.sim_func(string1[i - 1],
+                                                                string2[j - 1])
+                 delete = dist_mat[i - 1, j] - self.gap_cost
+                 insert = dist_mat[i, j - 1] - self.gap_cost
+                 dist_mat[i, j] = max(0, match, delete, insert)
+                 max_value = max(max_value, dist_mat[i, j])
+
+         return max_value
+
+     def get_gap_cost(self):
+         """Get gap cost.
+
+         Returns:
+             Gap cost (float).
+         """
+         return self.gap_cost
+
+     def get_sim_func(self):
+         """Get similarity function.
+
+         Returns:
+             Similarity function (function).
+         """
+         return self.sim_func
+
+     def set_gap_cost(self, gap_cost):
+         """Set gap cost.
+
+         Args:
+             gap_cost (float): Cost of gap.
+         """
+         self.gap_cost = gap_cost
+         return True
+
+     def set_sim_func(self, sim_func):
+         """Set similarity function.
+
+         Args:
+             sim_func (function): Similarity function to give a score for the correspondence between the characters.
+         """
+         self.sim_func = sim_func
+         return True
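A minimal usage sketch of the SmithWaterman class added above, for comparison with the global-alignment NeedlemanWunsch class. The import path comes from this diff's file list and the expected outputs are taken from the docstring examples.

    >>> from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman
    >>> sw = SmithWaterman()              # default: +1 per matching character, gap cost 1.0
    >>> sw.get_raw_score('cat', 'hat')    # best-scoring local region is the shared 'at'
    2.0
    >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda c1, c2: 2 if c1 == c2 else -1)
    >>> sw.get_raw_score('dva', 'deeve')
    2.0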
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py
@@ -0,0 +1,198 @@
+ from __future__ import division
+ from math import sqrt
+ import collections
+
+ from py_stringmatching import utils
+ from py_stringmatching.similarity_measure.jaro import Jaro
+ from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
+     HybridSimilarityMeasure
+
+
+ class SoftTfIdf(HybridSimilarityMeasure):
+     """Computes soft TF/IDF measure.
+
+     Args:
+         corpus_list (list of lists): Corpus list of strings (defaults to None). If set to None,
+                                      the input lists are considered the only corpus.
+         sim_func (function): Secondary similarity function. This should return a similarity score between two strings (optional);
+                              defaults to the Jaro similarity measure.
+         threshold (float): Threshold value for the secondary similarity function (defaults to 0.5). If the similarity
+                            of a token pair exceeds the threshold, then the token pair is considered a match.
+
+     Attributes:
+         sim_func (function): An attribute to store the secondary similarity function.
+         threshold (float): An attribute to store the threshold value for the secondary similarity function.
+
+     Note:
+         Currently, this measure is implemented without dampening. This is similar to setting the dampen flag to False in TF/IDF.
+         We plan to add the dampen flag in the next release.
+     """
+
+     def __init__(self, corpus_list=None, sim_func=Jaro().get_raw_score,
+                  threshold=0.5):
+         self.__corpus_list = corpus_list
+         self.__document_frequency = {}
+         self.__compute_document_frequency()
+         self.__corpus_size = 0 if self.__corpus_list is None else (
+             len(self.__corpus_list))
+         self.sim_func = sim_func
+         self.threshold = threshold
+         super(SoftTfIdf, self).__init__()
+
+     def get_raw_score(self, bag1, bag2):
+         """Computes the raw soft TF/IDF score between two lists given the corpus information.
+
+         Args:
+             bag1,bag2 (list): Input lists.
+
+         Returns:
+             Soft TF/IDF score between the input lists (float).
+
+         Raises:
+             TypeError : If the inputs are not lists or if one of the inputs is None.
+
+         Examples:
+             >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=Jaro().get_raw_score, threshold=0.8)
+             >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
+             0.17541160386140586
+             >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
+             >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
+             0.5547001962252291
+             >>> soft_tfidf = SoftTfIdf([['x', 'y'], ['w'], ['q']])
+             >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
+             0.0
+             >>> soft_tfidf = SoftTfIdf(sim_func=Affine().get_raw_score, threshold=0.6)
+             >>> soft_tfidf.get_raw_score(['aa', 'bb', 'a'], ['ab', 'ba'])
+             0.81649658092772592
+
+         References:
+             * the string matching chapter of the "Principles of Data Integration" book.
+         """
+
+         # input validations
+         utils.sim_check_for_none(bag1, bag2)
+         utils.sim_check_for_list_or_set_inputs(bag1, bag2)
+
+         # if the strings match exactly return 1.0
+         if utils.sim_check_for_exact_match(bag1, bag2):
+             return 1.0
+
+         # if one of the strings is empty return 0
+         if utils.sim_check_for_empty(bag1, bag2):
+             return 0
+
+         # term frequency for input strings
+         tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
+
+         # find unique elements in the input lists and their document frequency
+         local_df = {}
+         for element in tf_x:
+             local_df[element] = local_df.get(element, 0) + 1
+         for element in tf_y:
+             local_df[element] = local_df.get(element, 0) + 1
+
+         # if corpus is not provided treat input string as corpus
+         curr_df, corpus_size = (local_df, 2) if self.__corpus_list is None else (
+             (self.__document_frequency, self.__corpus_size))
+
+         # calculating the term sim score against the input string 2,
+         # construct similarity map
+         similarity_map = {}
+         for term_x in tf_x:
+             max_score = 0.0
+             for term_y in tf_y:
+                 score = self.sim_func(term_x, term_y)
+                 # adding sim only if it is above threshold and
+                 # highest for this element
+                 if score > self.threshold and score > max_score:
+                     similarity_map[term_x] = (term_x, term_y, score)
+                     max_score = score
+
+         # position of first string, second string and sim score
+         # in the tuple
+         first_string_pos = 0
+         second_string_pos = 1
+         sim_score_pos = 2
+
+         result, v_x_2, v_y_2 = 0.0, 0.0, 0.0
+         # soft-tfidf calculation
+         for element in local_df.keys():
+             if curr_df.get(element) is None:
+                 continue
+             # numerator
+             if element in similarity_map:
+                 sim = similarity_map[element]
+                 idf_first = corpus_size / curr_df.get(sim[first_string_pos], 1)
+                 idf_second = corpus_size / curr_df.get(sim[second_string_pos], 1)
+                 v_x = idf_first * tf_x.get(sim[first_string_pos], 0)
+                 v_y = idf_second * tf_y.get(sim[second_string_pos], 0)
+                 result += v_x * v_y * sim[sim_score_pos]
+             # denominator
+             idf = corpus_size / curr_df[element]
+             v_x = idf * tf_x.get(element, 0)
+             v_x_2 += v_x * v_x
+             v_y = idf * tf_y.get(element, 0)
+             v_y_2 += v_y * v_y
+         return result if v_x_2 == 0 else result / (sqrt(v_x_2) * sqrt(v_y_2))
+
+     def get_corpus_list(self):
+         """Get corpus list.
+
+         Returns:
+             corpus list (list of lists).
+         """
+         return self.__corpus_list
+
+     def get_sim_func(self):
+         """Get secondary similarity function.
+
+         Returns:
+             secondary similarity function (function).
+         """
+         return self.sim_func
+
+     def get_threshold(self):
+         """Get threshold used for the secondary similarity function.
+
+         Returns:
+             threshold (float).
+         """
+         return self.threshold
+
+     def set_threshold(self, threshold):
+         """Set threshold value for the secondary similarity function.
+
+         Args:
+             threshold (float): threshold value.
+         """
+         self.threshold = threshold
+         return True
+
+     def set_sim_func(self, sim_func):
+         """Set secondary similarity function.
+
+         Args:
+             sim_func (function): Secondary similarity function.
+         """
+         self.sim_func = sim_func
+         return True
+
+     def set_corpus_list(self, corpus_list):
+         """Set corpus list.
+
+         Args:
+             corpus_list (list of lists): Corpus list.
+         """
+         self.__corpus_list = corpus_list
+         self.__document_frequency = {}
+         self.__compute_document_frequency()
+         self.__corpus_size = 0 if self.__corpus_list is None else (
+             len(self.__corpus_list))
+         return True
+
+     def __compute_document_frequency(self):
+         if self.__corpus_list != None:
+             for document in self.__corpus_list:
+                 for element in set(document):
+                     self.__document_frequency[element] = (
+                         self.__document_frequency.get(element, 0) + 1)
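Finally, a minimal usage sketch of the SoftTfIdf class added above. The import paths come from this diff's file list, the corpus and threshold values mirror the first docstring example, and the expected output is the value quoted in that example.

    >>> from py_stringmatching.similarity_measure.jaro import Jaro
    >>> from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
    >>> # corpus of three token lists; tokens of the two input bags are matched with
    >>> # Jaro similarity, and only pairs scoring above 0.8 count toward the score
    >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']],
    ...                        sim_func=Jaro().get_raw_score, threshold=0.8)
    >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
    0.17541160386140586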