py-stringmatching 0.1.0 (zip)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. py_stringmatching-0.1.0/AUTHORS.rst +6 -0
  2. py_stringmatching-0.1.0/CHANGES.txt +6 -0
  3. py_stringmatching-0.1.0/LICENSE +27 -0
  4. py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
  5. py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
  6. py_stringmatching-0.1.0/MANIFEST.in +6 -0
  7. py_stringmatching-0.1.0/PKG-INFO +57 -0
  8. py_stringmatching-0.1.0/README.rst +27 -0
  9. py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
  10. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
  11. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
  12. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
  13. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
  14. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
  15. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
  16. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
  17. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
  18. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
  19. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
  20. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
  21. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
  22. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
  23. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
  24. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
  25. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
  26. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
  27. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
  28. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
  29. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
  30. py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
  31. py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
  32. py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
  33. py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
  34. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
  35. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
  36. py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
  37. py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
  38. py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
  39. py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
  40. py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
  41. py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
  42. py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
  43. py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
  44. py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
  45. py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
  46. py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
  47. py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
  48. py_stringmatching-0.1.0/requirements.txt +2 -0
  49. py_stringmatching-0.1.0/setup.cfg +5 -0
  50. py_stringmatching-0.1.0/setup.py +107 -0
@@ -0,0 +1,193 @@
1
+ from __future__ import division
2
+ from math import log, sqrt
3
+ import collections
4
+
5
+ from py_stringmatching import utils
6
+ from py_stringmatching.similarity_measure.token_similarity_measure import \
7
+ TokenSimilarityMeasure
8
+
9
+
10
class TfIdf(TokenSimilarityMeasure):
    """Computes TF/IDF measure.

    This measure employs the notion of TF/IDF score commonly used in
    information retrieval (IR) to find documents that are relevant to keyword
    queries. The intuition underlying the TF/IDF measure is that two strings
    are similar if they share distinguishing terms. See the string matching
    chapter in the book "Principles of Data Integration".

    Note:
        Currently when you create a TF/IDF similarity measure object, the
        dampen flag is set to False by default. In most cases, you will want
        to set this flag to True, so that the TF and IDF formulas use
        logarithmic scaling. So when creating this object, consider setting
        the flag to True. This will likely be fixed in the next release.

    Args:
        corpus_list (list of lists): The corpus that will be used to compute
            TF and IDF values. This corpus is a list of strings, where each
            string has been tokenized into a list of tokens (that is, a bag
            of tokens). The default is set to None. In this case, when we
            call this TF/IDF measure on two input strings (using
            get_raw_score or get_sim_score), the corpus is taken to be the
            list of those two strings.
        dampen (boolean): Flag to indicate whether 'log' should be used in
            TF and IDF formulas. In general this flag should be set to True.

    Attributes:
        dampen (boolean): An attribute to store the dampen flag.
    """

    def __init__(self, corpus_list=None, dampen=False):
        self.__corpus_list = corpus_list
        # document frequency of each token over the corpus, filled below
        self.__document_frequency = {}
        self.__compute_document_frequency()
        self.__corpus_size = 0 if self.__corpus_list is None else len(
            self.__corpus_list)
        self.dampen = dampen
        super(TfIdf, self).__init__()

    def get_raw_score(self, bag1, bag2):
        """Computes the raw TF/IDF score between two lists.

        Args:
            bag1,bag2 (list): Input lists.

        Returns:
            TF/IDF score between the input lists (float).

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None.

        Examples:

            >>> # here the corpus is a list of three strings that
            >>> # have been tokenized into three lists of tokens
            >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']])
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
            0.17541160386140586
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.5547001962252291
            >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True)
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
            0.11166746710505392
            >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']])
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']], True)
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> tfidf = TfIdf()
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.7071067811865475
        """
        # input validations
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # if the strings match exactly return 1.0
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(bag1, bag2):
            return 0

        # term frequency for input strings
        tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)

        # document frequency of each unique token over the two input bags
        # (each token counts at most once per bag)
        local_df = {}
        for element in tf_x:
            local_df[element] = local_df.get(element, 0) + 1
        for element in tf_y:
            local_df[element] = local_df.get(element, 0) + 1

        # if corpus is not provided, treat the two input bags as the corpus
        if self.__corpus_list is None:
            curr_df, corpus_size = local_df, 2
        else:
            curr_df, corpus_size = (self.__document_frequency,
                                    self.__corpus_size)

        # accumulators for the cosine of the tf-idf weighted vectors
        v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0

        # tfidf calculation
        for element in local_df:
            df_element = curr_df.get(element)
            if df_element is None:
                # token does not occur in the corpus: zero weight, skip
                continue
            idf_element = corpus_size * 1.0 / df_element
            # tf-idf weight of the token in each bag; a Counter returns 0
            # for missing tokens, so membership decides the zero weight
            if self.dampen:
                log_idf = log(idf_element)
                v_x = log_idf * log(tf_x[element] + 1) if element in tf_x else 0
                v_y = log_idf * log(tf_y[element] + 1) if element in tf_y else 0
            else:
                v_x = idf_element * tf_x[element] if element in tf_x else 0
                v_y = idf_element * tf_y[element] if element in tf_y else 0
            v_x_y += v_x * v_y
            v_x_2 += v_x * v_x
            v_y_2 += v_y * v_y

        # cosine similarity of the two weighted vectors
        return 0.0 if v_x_y == 0 else v_x_y / (sqrt(v_x_2) * sqrt(v_y_2))

    def get_sim_score(self, bag1, bag2):
        """Computes the normalized TF/IDF similarity score between two lists. Simply call get_raw_score.

        Args:
            bag1,bag2 (list): Input lists.

        Returns:
            Normalized TF/IDF similarity score between the input lists (float).

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None.

        Examples:

            >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']])
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a', 'c'])
            0.17541160386140586
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a'])
            0.5547001962252291
            >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True)
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a', 'c'])
            0.11166746710505392
            >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']])
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']], True)
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> tfidf = TfIdf()
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a'])
            0.7071067811865475
        """
        # raw score is already normalized to [0, 1]
        return self.get_raw_score(bag1, bag2)

    def get_dampen(self):
        """Get dampen flag.

        Returns:
            dampen flag (boolean).
        """
        return self.dampen

    def get_corpus_list(self):
        """Get corpus list.

        Returns:
            corpus list (list of lists).
        """
        return self.__corpus_list

    def set_dampen(self, dampen):
        """Set dampen flag.

        Args:
            dampen (boolean): Flag to indicate whether 'log' should be applied to TF and IDF formulas.
        """
        self.dampen = dampen
        return True

    def set_corpus_list(self, corpus_list):
        """Set corpus list.

        Args:
            corpus_list (list of lists): Corpus list.
        """
        self.__corpus_list = corpus_list
        # recompute frequencies and size for the new corpus
        self.__document_frequency = {}
        self.__compute_document_frequency()
        self.__corpus_size = 0 if self.__corpus_list is None else len(
            self.__corpus_list)
        return True

    def __compute_document_frequency(self):
        # Count, for each token, the number of corpus documents that
        # contain it at least once (hence set(document)).
        if self.__corpus_list is not None:
            for document in self.__corpus_list:
                for element in set(document):
                    self.__document_frequency[element] = (
                        self.__document_frequency.get(element, 0) + 1)
@@ -0,0 +1,7 @@
1
+ """Token based similarity measure"""
2
+
3
+ from py_stringmatching.similarity_measure.similarity_measure import \
4
+ SimilarityMeasure
5
+
6
class TokenSimilarityMeasure(SimilarityMeasure):
    """Marker base class for token-based similarity measures.

    Token-based measures (e.g. Jaccard, Dice, TF/IDF) operate on bags of
    tokens rather than raw character sequences; subclasses implement the
    actual scoring methods.
    """
    pass