py-stringmatching 0.1.0 (zip)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. py_stringmatching-0.1.0/AUTHORS.rst +6 -0
  2. py_stringmatching-0.1.0/CHANGES.txt +6 -0
  3. py_stringmatching-0.1.0/LICENSE +27 -0
  4. py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
  5. py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
  6. py_stringmatching-0.1.0/MANIFEST.in +6 -0
  7. py_stringmatching-0.1.0/PKG-INFO +57 -0
  8. py_stringmatching-0.1.0/README.rst +27 -0
  9. py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
  10. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
  11. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
  12. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
  13. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
  14. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
  15. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
  16. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
  17. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
  18. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
  19. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
  20. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
  21. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
  22. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
  23. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
  24. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
  25. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
  26. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
  27. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
  28. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
  29. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
  30. py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
  31. py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
  32. py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
  33. py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
  34. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
  35. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
  36. py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
  37. py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
  38. py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
  39. py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
  40. py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
  41. py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
  42. py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
  43. py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
  44. py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
  45. py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
  46. py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
  47. py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
  48. py_stringmatching-0.1.0/requirements.txt +2 -0
  49. py_stringmatching-0.1.0/setup.cfg +5 -0
  50. py_stringmatching-0.1.0/setup.py +107 -0
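
Before the per-file diffs, a minimal usage sketch of the tokenizers added in this release. The calls and expected outputs are taken from the docstrings and tests shown below; the import paths follow the file list above.

# Usage sketch based on the docstring examples in this diff.
from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
from py_stringmatching.tokenizer.alphanumeric_tokenizer import AlphanumericTokenizer

qg2_tok = QgramTokenizer()                          # q defaults to 2
print(qg2_tok.tokenize('database'))                 # ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']

delim_tok = DelimiterTokenizer([',', '.'])          # split on ',' and '.'
print(delim_tok.tokenize('data,science.data,integration.'))
# ['data', 'science', 'data', 'integration']

alnum_tok = AlphanumericTokenizer(return_set=True)  # drop duplicate tokens
print(alnum_tok.tokenize('data9,(science), data9#.(integration).88'))
# ['data9', 'science', 'integration', '88']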
py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py
@@ -0,0 +1,305 @@
+ from __future__ import unicode_literals
+
+ import unittest
+ from nose.tools import *
+
+ from py_stringmatching.tokenizer.alphabetic_tokenizer import AlphabeticTokenizer
+ from py_stringmatching.tokenizer.alphanumeric_tokenizer import AlphanumericTokenizer
+ from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
+ from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer
+ from py_stringmatching.tokenizer.whitespace_tokenizer import WhitespaceTokenizer
+
+
+ class QgramTokenizerTestCases(unittest.TestCase):
+     def setUp(self):
+         self.qg1_tok = QgramTokenizer(1)
+         self.qg2_tok = QgramTokenizer()
+         self.qg2_tok_return_set = QgramTokenizer(return_set=True)
+         self.qg3_tok = QgramTokenizer(3)
+
+     def test_qgrams_valid(self):
+         self.assertEqual(self.qg2_tok.tokenize(''), [])
+         self.assertEqual(self.qg2_tok.tokenize('a'), [])
+         self.assertEqual(self.qg2_tok.tokenize('aa'), ['aa'])
+         self.assertEqual(self.qg2_tok.tokenize('database'),
+                          ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se'])
+         self.assertEqual(self.qg2_tok.tokenize('aabaabcdba'),
+                          ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba'])
+         self.assertEqual(self.qg2_tok_return_set.tokenize('aabaabcdba'),
+                          ['aa', 'ab', 'ba', 'bc', 'cd', 'db'])
+         self.assertEqual(self.qg1_tok.tokenize('d'), ['d'])
+         self.assertEqual(self.qg3_tok.tokenize('database'),
+                          ['dat', 'ata', 'tab', 'aba', 'bas', 'ase'])
+
+     def test_get_return_set(self):
+         self.assertEqual(self.qg2_tok.get_return_set(), False)
+         self.assertEqual(self.qg2_tok_return_set.get_return_set(), True)
+
+     def test_get_qval(self):
+         self.assertEqual(self.qg2_tok.get_qval(), 2)
+         self.assertEqual(self.qg3_tok.get_qval(), 3)
+
+     def test_set_return_set(self):
+         tok = QgramTokenizer()
+         self.assertEqual(tok.get_return_set(), False)
+         self.assertEqual(tok.tokenize('aabaabcdba'),
+                          ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba'])
+         self.assertEqual(tok.set_return_set(True), True)
+         self.assertEqual(tok.get_return_set(), True)
+         self.assertEqual(tok.tokenize('aabaabcdba'),
+                          ['aa', 'ab', 'ba', 'bc', 'cd', 'db'])
+         self.assertEqual(tok.set_return_set(False), True)
+         self.assertEqual(tok.get_return_set(), False)
+         self.assertEqual(tok.tokenize('aabaabcdba'),
+                          ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba'])
+
+     def test_set_qval(self):
+         tok = QgramTokenizer()
+         self.assertEqual(tok.get_qval(), 2)
+         self.assertEqual(tok.tokenize('database'),
+                          ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se'])
+         self.assertEqual(tok.set_qval(3), True)
+         self.assertEqual(tok.get_qval(), 3)
+         self.assertEqual(tok.tokenize('database'),
+                          ['dat', 'ata', 'tab', 'aba', 'bas', 'ase'])
+
+     @raises(TypeError)
+     def test_qgrams_none(self):
+         self.qg2_tok.tokenize(None)
+
+     @raises(AssertionError)
+     def test_qgrams_invalid1(self):
+         invalid_qg_tok = QgramTokenizer(0)
+
+     @raises(TypeError)
+     def test_qgrams_invalid2(self):
+         self.qg2_tok.tokenize(99)
+
+     @raises(AssertionError)
+     def test_set_qval_invalid(self):
+         qg_tok = QgramTokenizer()
+         qg_tok.set_qval(0)
+
+
+ class DelimiterTokenizerTestCases(unittest.TestCase):
+     def setUp(self):
+         self.delim_tok1 = DelimiterTokenizer()
+         self.delim_tok2 = DelimiterTokenizer(set([',']))
+         self.delim_tok3 = DelimiterTokenizer(set(['*', '.']))
+         self.delim_tok4 = DelimiterTokenizer(set(['..', 'ab']))
+         self.delim_tok4_list = DelimiterTokenizer(['..', 'ab', '..'])
+         self.delim_tok4_return_set = DelimiterTokenizer(set(['..', 'ab']),
+                                                         return_set=True)
+
+     def test_delimiter_valid(self):
+         self.assertEqual(self.delim_tok1.tokenize('data science'),
+                          ['data', 'science'])
+         self.assertEqual(self.delim_tok2.tokenize('data,science'),
+                          ['data', 'science'])
+         self.assertEqual(self.delim_tok2.tokenize('data science'),
+                          ['data science'])
+         self.assertEqual(self.delim_tok3.tokenize('ab cd*ef.*bb. gg.'),
+                          ['ab cd', 'ef', 'bb', ' gg'])
+         self.assertEqual(
+             self.delim_tok4.tokenize('ab cd..efabbb....ggab cd..efabgh'),
+             [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
+         self.assertEqual(
+             self.delim_tok4_list.tokenize('ab cd..efabbb....ggab cd..efabgh'),
+             [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
+         self.assertEqual(
+             self.delim_tok4_return_set.tokenize(
+                 'ab cd..efabbb....ggab cd..efabgh'),
+             [' cd', 'ef', 'bb', 'gg', 'gh'])
+
+     def test_get_return_set(self):
+         self.assertEqual(self.delim_tok4.get_return_set(), False)
+         self.assertEqual(self.delim_tok4_return_set.get_return_set(), True)
+
+     def test_get_delim_set(self):
+         self.assertSetEqual(self.delim_tok1.get_delim_set(), {' '})
+         self.assertSetEqual(self.delim_tok3.get_delim_set(), {'*', '.'})
+         self.assertSetEqual(self.delim_tok4_list.get_delim_set(), {'..', 'ab'})
+
+     def test_set_return_set(self):
+         tok = DelimiterTokenizer(set(['..', 'ab']))
+         self.assertEqual(tok.get_return_set(), False)
+         self.assertEqual(
+             tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
+             [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
+         self.assertEqual(tok.set_return_set(True), True)
+         self.assertEqual(tok.get_return_set(), True)
+         self.assertEqual(
+             tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
+             [' cd', 'ef', 'bb', 'gg', 'gh'])
+         self.assertEqual(tok.set_return_set(False), True)
+         self.assertEqual(tok.get_return_set(), False)
+         self.assertEqual(
+             tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
+             [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
+
+     def test_set_delim_set(self):
+         tok = DelimiterTokenizer(['*', '.'])
+         self.assertSetEqual(tok.get_delim_set(), {'*', '.'})
+         self.assertEqual(tok.tokenize('ab cd*ef.*bb. gg.'),
+                          ['ab cd', 'ef', 'bb', ' gg'])
+         self.assertEqual(tok.set_delim_set({'..', 'ab'}), True)
+         self.assertSetEqual(tok.get_delim_set(), {'..', 'ab'})
+         self.assertEqual(
+             tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
+             [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
+
+     @raises(TypeError)
+     def test_delimiter_invalid1(self):
+         invalid_delim_tok = DelimiterTokenizer(set([',', 10]))
+
+     @raises(TypeError)
+     def test_delimiter_invalid2(self):
+         self.delim_tok1.tokenize(None)
+
+     @raises(TypeError)
+     def test_delimiter_invalid3(self):
+         self.delim_tok1.tokenize(99)
+
+
+ class WhitespaceTokenizerTestCases(unittest.TestCase):
+     def setUp(self):
+         self.ws_tok = WhitespaceTokenizer()
+         self.ws_tok_return_set = WhitespaceTokenizer(return_set=True)
+
+     def test_whitespace_tok_valid(self):
+         self.assertEqual(self.ws_tok.tokenize('data science'),
+                          ['data', 'science'])
+         self.assertEqual(self.ws_tok.tokenize('data science'),
+                          ['data', 'science'])
+         self.assertEqual(self.ws_tok.tokenize('data science'),
+                          ['data', 'science'])
+         self.assertEqual(self.ws_tok.tokenize('data\tscience'),
+                          ['data', 'science'])
+         self.assertEqual(self.ws_tok.tokenize('data\nscience'),
+                          ['data', 'science'])
+         self.assertEqual(self.ws_tok.tokenize('ab cd ab bb cd db'),
+                          ['ab', 'cd', 'ab', 'bb', 'cd', 'db'])
+         self.assertEqual(self.ws_tok_return_set.tokenize('ab cd ab bb cd db'),
+                          ['ab', 'cd', 'bb', 'db'])
+
+     def test_get_return_set(self):
+         self.assertEqual(self.ws_tok.get_return_set(), False)
+         self.assertEqual(self.ws_tok_return_set.get_return_set(), True)
+
+     def test_set_return_set(self):
+         tok = WhitespaceTokenizer()
+         self.assertEqual(tok.get_return_set(), False)
+         self.assertEqual(tok.tokenize('ab cd ab bb cd db'),
+                          ['ab', 'cd', 'ab', 'bb', 'cd', 'db'])
+         self.assertEqual(tok.set_return_set(True), True)
+         self.assertEqual(tok.get_return_set(), True)
+         self.assertEqual(tok.tokenize('ab cd ab bb cd db'),
+                          ['ab', 'cd', 'bb', 'db'])
+         self.assertEqual(tok.set_return_set(False), True)
+         self.assertEqual(tok.get_return_set(), False)
+         self.assertEqual(tok.tokenize('ab cd ab bb cd db'),
+                          ['ab', 'cd', 'ab', 'bb', 'cd', 'db'])
+
+     def test_get_delim_set(self):
+         self.assertSetEqual(self.ws_tok.get_delim_set(), {' ', '\t', '\n'})
+
+     @raises(TypeError)
+     def test_whitespace_tok_invalid1(self):
+         self.ws_tok.tokenize(None)
+
+     @raises(TypeError)
+     def test_whitespace_tok_invalid2(self):
+         self.ws_tok.tokenize(99)
+
+     @raises(AttributeError)
+     def test_set_delim_set(self):
+         self.ws_tok.set_delim_set({'*', '.'})
+
+
+ class AlphabeticTokenizerTestCases(unittest.TestCase):
+     def setUp(self):
+         self.al_tok = AlphabeticTokenizer()
+         self.al_tok_return_set = AlphabeticTokenizer(return_set=True)
+
+     def test_alphabetic_tok_valid(self):
+         self.assertEqual(self.al_tok.tokenize(''), [])
+         self.assertEqual(self.al_tok.tokenize('99'), [])
+         self.assertEqual(self.al_tok.tokenize('hello'), ['hello'])
+         self.assertEqual(self.al_tok.tokenize('ab bc. cd##de ef09 bc fg ab.'),
+                          ['ab', 'bc', 'cd', 'de', 'ef', 'bc', 'fg', 'ab'])
+         self.assertEqual(
+             self.al_tok_return_set.tokenize('ab bc. cd##de ef09 bc fg ab.'),
+             ['ab', 'bc', 'cd', 'de', 'ef', 'fg'])
+
+     def test_get_return_set(self):
+         self.assertEqual(self.al_tok.get_return_set(), False)
+         self.assertEqual(self.al_tok_return_set.get_return_set(), True)
+
+     def test_set_return_set(self):
+         tok = AlphabeticTokenizer()
+         self.assertEqual(tok.get_return_set(), False)
+         self.assertEqual(tok.tokenize('ab bc. cd##de ef09 bc fg ab.'),
+                          ['ab', 'bc', 'cd', 'de', 'ef', 'bc', 'fg', 'ab'])
+         self.assertEqual(tok.set_return_set(True), True)
+         self.assertEqual(tok.get_return_set(), True)
+         self.assertEqual(
+             tok.tokenize('ab bc. cd##de ef09 bc fg ab.'),
+             ['ab', 'bc', 'cd', 'de', 'ef', 'fg'])
+         self.assertEqual(tok.set_return_set(False), True)
+         self.assertEqual(tok.get_return_set(), False)
+         self.assertEqual(tok.tokenize('ab bc. cd##de ef09 bc fg ab.'),
+                          ['ab', 'bc', 'cd', 'de', 'ef', 'bc', 'fg', 'ab'])
+
+     @raises(TypeError)
+     def test_alphabetic_tok_invalid1(self):
+         self.al_tok.tokenize(None)
+
+     @raises(TypeError)
+     def test_alphabetic_tok_invalid2(self):
+         self.al_tok.tokenize(99)
+
+
+ class AlphanumericTokenizerTestCases(unittest.TestCase):
+     def setUp(self):
+         self.alnum_tok = AlphanumericTokenizer()
+         self.alnum_tok_return_set = AlphanumericTokenizer(return_set=True)
+
+     def test_alphanumeric_tok_valid(self):
+         self.assertEqual(self.alnum_tok.tokenize(''), [])
+         self.assertEqual(self.alnum_tok.tokenize('#$'), [])
+         self.assertEqual(self.alnum_tok.tokenize('hello99'), ['hello99'])
+         self.assertEqual(
+             self.alnum_tok.tokenize(',data9,(science), data9#.(integration).88!'),
+             ['data9', 'science', 'data9', 'integration', '88'])
+         self.assertEqual(self.alnum_tok_return_set.tokenize(
+             ',data9,(science), data9#.(integration).88!'),
+             ['data9', 'science', 'integration', '88'])
+
+     def test_get_return_set(self):
+         self.assertEqual(self.alnum_tok.get_return_set(), False)
+         self.assertEqual(self.alnum_tok_return_set.get_return_set(), True)
+
+     def test_set_return_set(self):
+         tok = AlphanumericTokenizer()
+         self.assertEqual(tok.get_return_set(), False)
+         self.assertEqual(
+             tok.tokenize(',data9,(science), data9#.(integration).88!'),
+             ['data9', 'science', 'data9', 'integration', '88'])
+         self.assertEqual(tok.set_return_set(True), True)
+         self.assertEqual(tok.get_return_set(), True)
+         self.assertEqual(
+             tok.tokenize(',data9,(science), data9#.(integration).88!'),
+             ['data9', 'science', 'integration', '88'])
+         self.assertEqual(tok.set_return_set(False), True)
+         self.assertEqual(tok.get_return_set(), False)
+         self.assertEqual(
+             tok.tokenize(',data9,(science), data9#.(integration).88!'),
+             ['data9', 'science', 'data9', 'integration', '88'])
+
+     @raises(TypeError)
+     def test_alphanumeric_tok_invalid1(self):
+         self.alnum_tok.tokenize(None)
+
+     @raises(TypeError)
+     def test_alphanumeric_tok_invalid2(self):
+         self.alnum_tok.tokenize(99)
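
The bag-versus-set behavior that the tests above toggle repeatedly can be summarized in a short sketch; the calls and expected outputs mirror the WhitespaceTokenizer test case.

# Sketch of the return_set toggle exercised by the tests above.
from py_stringmatching.tokenizer.whitespace_tokenizer import WhitespaceTokenizer

ws_tok = WhitespaceTokenizer()                 # bag of tokens (default)
print(ws_tok.tokenize('ab cd ab bb cd db'))    # ['ab', 'cd', 'ab', 'bb', 'cd', 'db']

ws_tok.set_return_set(True)                    # switch to set semantics
print(ws_tok.tokenize('ab cd ab bb cd db'))    # ['ab', 'cd', 'bb', 'db']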
py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py
@@ -0,0 +1,51 @@
+ import re
+
+ from py_stringmatching import utils
+ from py_stringmatching.tokenizer.definition_tokenizer import DefinitionTokenizer
+
+
+ class AlphabeticTokenizer(DefinitionTokenizer):
+     """Returns tokens that are maximal sequences of consecutive alphabetical characters.
+
+     Args:
+         return_set (boolean): A flag to indicate whether to return a set of tokens instead of a bag of tokens (defaults to False).
+
+     Attributes:
+         return_set (boolean): An attribute that stores the value for the flag return_set.
+     """
+
+     def __init__(self, return_set=False):
+         self.__al_regex = re.compile('[a-zA-Z]+')
+         super(AlphabeticTokenizer, self).__init__(return_set)
+
+     def tokenize(self, input_string):
+         """Tokenizes input string into alphabetical tokens.
+
+         Args:
+             input_string (str): The string to be tokenized.
+
+         Returns:
+             A Python list, which represents a set of tokens if the flag return_set is True, and a bag of tokens otherwise.
+
+         Raises:
+             TypeError : If the input is not a string.
+
+         Examples:
+             >>> al_tok = AlphabeticTokenizer()
+             >>> al_tok.tokenize('data99science, data#integration.')
+             ['data', 'science', 'data', 'integration']
+             >>> al_tok.tokenize('99')
+             []
+             >>> al_tok = AlphabeticTokenizer(return_set=True)
+             >>> al_tok.tokenize('data99science, data#integration.')
+             ['data', 'science', 'integration']
+         """
+         utils.tok_check_for_none(input_string)
+         utils.tok_check_for_string_input(input_string)
+
+         token_list = list(filter(None, self.__al_regex.findall(input_string)))
+
+         if self.return_set:
+             return utils.convert_bag_to_set(token_list)
+
+         return token_list
py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py
@@ -0,0 +1,54 @@
+ import re
+
+ from py_stringmatching import utils
+ from py_stringmatching.tokenizer.definition_tokenizer import DefinitionTokenizer
+
+
+ class AlphanumericTokenizer(DefinitionTokenizer):
+     """Returns tokens that are maximal sequences of consecutive alphanumeric characters.
+
+     Args:
+         return_set (boolean): A flag to indicate whether to return a set of
+                               tokens instead of a bag of tokens (defaults to False).
+
+     Attributes:
+         return_set (boolean): An attribute to store the value of the flag return_set.
+     """
+
+     def __init__(self, return_set=False):
+         self.__alnum_regex = re.compile('[a-zA-Z0-9]+')
+         super(AlphanumericTokenizer, self).__init__(return_set)
+
+     def tokenize(self, input_string):
+         """Tokenizes input string into alphanumeric tokens.
+
+         Args:
+             input_string (str): The string to be tokenized.
+
+         Returns:
+             A Python list, which represents a set of tokens if the flag return_set is True, and a bag of tokens otherwise.
+
+         Raises:
+             TypeError : If the input is not a string.
+
+         Examples:
+             >>> alnum_tok = AlphanumericTokenizer()
+             >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
+             ['data9', 'science', 'data9', 'integration', '88']
+             >>> alnum_tok.tokenize('#.&')
+             []
+             >>> alnum_tok = AlphanumericTokenizer(return_set=True)
+             >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
+             ['data9', 'science', 'integration', '88']
+
+         """
+         utils.tok_check_for_none(input_string)
+         utils.tok_check_for_string_input(input_string)
+
+         token_list = list(filter(None,
+                                  self.__alnum_regex.findall(input_string)))
+
+         if self.return_set:
+             return utils.convert_bag_to_set(token_list)
+
+         return token_list
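
The two definition tokenizers above differ only in their regular expressions ('[a-zA-Z]+' versus '[a-zA-Z0-9]+'). A small contrast on the docstring's example input; the first output is taken from the AlphabeticTokenizer docstring, the second is derived from the alphanumeric regex rather than quoted from the source.

# Contrast of the two definition tokenizers on the same input.
from py_stringmatching.tokenizer.alphabetic_tokenizer import AlphabeticTokenizer
from py_stringmatching.tokenizer.alphanumeric_tokenizer import AlphanumericTokenizer

s = 'data99science, data#integration.'
print(AlphabeticTokenizer().tokenize(s))     # ['data', 'science', 'data', 'integration']
print(AlphanumericTokenizer().tokenize(s))   # ['data99science', 'data', 'integration']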
py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py
@@ -0,0 +1,18 @@
+ from py_stringmatching.tokenizer.tokenizer import Tokenizer
+
+
+ class DefinitionTokenizer(Tokenizer):
+     """A class of tokenizers that uses a definition to find tokens, as opposed to using delimiters.
+
+     Examples of definitions include alphabetical tokens and qgram tokens. Examples of delimiters include white space and punctuation.
+
+     Args:
+         return_set (boolean): A flag to indicate whether to return a set of
+                               tokens instead of a bag of tokens (defaults to False).
+
+     Attributes:
+         return_set (boolean): An attribute to store the flag return_set.
+     """
+
+     def __init__(self, return_set=False):
+         super(DefinitionTokenizer, self).__init__(return_set)
py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py
@@ -0,0 +1,99 @@
+ import re
+
+ from py_stringmatching import utils
+ from py_stringmatching.tokenizer.tokenizer import Tokenizer
+
+
+ class DelimiterTokenizer(Tokenizer):
+     """Uses delimiters to find tokens, as opposed to using definitions.
+
+     Examples of delimiters include white space and punctuation. Examples of definitions include alphabetical and qgram tokens.
+
+     Args:
+         delim_set (set): A set of delimiter strings (defaults to a space delimiter).
+         return_set (boolean): A flag to indicate whether to return a set of
+                               tokens instead of a bag of tokens (defaults to False).
+
+     Attributes:
+         return_set (boolean): An attribute to store the value of the flag return_set.
+     """
+
+     def __init__(self, delim_set=set([' ']), return_set=False):
+         self.__delim_set = None
+         self.__use_split = None
+         self.__delim_str = None
+         self.__delim_regex = None
+         self._update_delim_set(delim_set)
+         super(DelimiterTokenizer, self).__init__(return_set)
+
+     def tokenize(self, input_string):
+         """Tokenizes input string based on the set of delimiters.
+
+         Args:
+             input_string (str): The string to be tokenized.
+
+         Returns:
+             A Python list, which is a set or a bag of tokens, depending on whether the return_set flag is set to True or False.
+
+         Raises:
+             TypeError : If the input is not a string.
+
+         Examples:
+             >>> delim_tok = DelimiterTokenizer()
+             >>> delim_tok.tokenize('data science')
+             ['data', 'science']
+             >>> delim_tok = DelimiterTokenizer(['$#$'])
+             >>> delim_tok.tokenize('data$#$science')
+             ['data', 'science']
+             >>> delim_tok = DelimiterTokenizer([',', '.'])
+             >>> delim_tok.tokenize('data,science.data,integration.')
+             ['data', 'science', 'data', 'integration']
+             >>> delim_tok = DelimiterTokenizer([',', '.'], return_set=True)
+             >>> delim_tok.tokenize('data,science.data,integration.')
+             ['data', 'science', 'integration']
+
+         """
+         utils.tok_check_for_none(input_string)
+         utils.tok_check_for_string_input(input_string)
+
+         if self.__use_split:
+             token_list = list(filter(None,
+                                      input_string.split(self.__delim_str)))
+         else:
+             token_list = list(filter(None,
+                                      self.__delim_regex.split(input_string)))
+
+         if self.return_set:
+             return utils.convert_bag_to_set(token_list)
+
+         return token_list
+
+     def get_delim_set(self):
+         """Gets the current set of delimiters.
+
+         Returns:
+             A Python set, which is the current set of delimiters.
+         """
+         return self.__delim_set
+
+     def set_delim_set(self, delim_set):
+         """Sets the current set of delimiters.
+
+         Args:
+             delim_set (set): A set of delimiter strings.
+         """
+         return self._update_delim_set(delim_set)
+
+     def _update_delim_set(self, delim_set):
+         if not isinstance(delim_set, set):
+             delim_set = set(delim_set)
+         self.__delim_set = delim_set
+         # if there is only one delimiter string, use str.split instead of a regex
+         self.__use_split = False
+         if len(self.__delim_set) == 1:
+             self.__delim_str = list(self.__delim_set)[0]
+             self.__use_split = True
+         else:
+             self.__delim_regex = re.compile('|'.join(
+                 map(re.escape, self.__delim_set)))
+         return True
+
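
_update_delim_set chooses between two code paths: a plain str.split when exactly one delimiter is configured, and a regex of escaped alternatives otherwise. The following is only an illustration of those two paths using the standard library directly; the tokenizer performs this internally. Expected outputs come from the tests above.

# Illustration of the single-delimiter and multi-delimiter paths.
import re

delims = {'..', 'ab'}                                     # multi-delimiter path: regex split
pattern = re.compile('|'.join(map(re.escape, delims)))
print(list(filter(None, pattern.split('ab cd..efabbb....ggab cd..efabgh'))))
# [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']

single = ','                                              # single-delimiter path: str.split
print(list(filter(None, 'data,science'.split(single))))
# ['data', 'science']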
py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py
@@ -0,0 +1,90 @@
+ from py_stringmatching import utils
+ from six.moves import xrange
+ from py_stringmatching.tokenizer.definition_tokenizer import DefinitionTokenizer
+
+
+ class QgramTokenizer(DefinitionTokenizer):
+     """Returns tokens that are sequences of q consecutive characters.
+
+     A qgram of an input string s is a substring t (of s) which is a sequence of q consecutive characters. Qgrams are also known as
+     ngrams or kgrams.
+
+     Args:
+         qval (int): A value for q, that is, the qgram's length (defaults to 2).
+         return_set (boolean): A flag to indicate whether to return a set of
+                               tokens or a bag of tokens (defaults to False).
+
+     Attributes:
+         qval (int): An attribute to store the q value.
+         return_set (boolean): An attribute to store the flag return_set.
+     """
+
+     def __init__(self, qval=2, return_set=False):
+         if qval < 1:
+             raise AssertionError("qval cannot be less than 1")
+         self.qval = qval
+         super(QgramTokenizer, self).__init__(return_set)
+
+     def tokenize(self, input_string):
+         """Tokenizes input string into qgrams.
+
+         Args:
+             input_string (str): The string to be tokenized.
+
+         Returns:
+             A Python list, which is a set or a bag of qgrams, depending on whether the return_set flag is True or False.
+
+         Raises:
+             TypeError : If the input is not a string.
+
+         Examples:
+             >>> qg2_tok = QgramTokenizer()
+             >>> qg2_tok.tokenize('database')
+             ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
+             >>> qg2_tok.tokenize('a')
+             []
+             >>> qg3_tok = QgramTokenizer(3)
+             >>> qg3_tok.tokenize('database')
+             ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']
+
+         As these examples show, the current qgram tokenizer does not consider the case of appending #s at the
+         start and the end of the input string. This is left for future work.
+         """
+         utils.tok_check_for_none(input_string)
+         utils.tok_check_for_string_input(input_string)
+
+         qgram_list = []
+
+         if len(input_string) < self.qval:
+             return qgram_list
+
+         qgram_list = [input_string[i:i + self.qval] for i in
+                       xrange(len(input_string) - (self.qval - 1))]
+         qgram_list = list(filter(None, qgram_list))
+
+         if self.return_set:
+             return utils.convert_bag_to_set(qgram_list)
+
+         return qgram_list
+
+     def get_qval(self):
+         """Gets the value of the qval attribute, which is the length of qgrams.
+
+         Returns:
+             The value of the qval attribute.
+         """
+         return self.qval
+
+     def set_qval(self, qval):
+         """Sets the value of the qval attribute.
+
+         Args:
+             qval (int): A value for q (the length of qgrams).
+
+         Raises:
+             AssertionError : If qval is less than 1.
+         """
+         if qval < 1:
+             raise AssertionError("qval cannot be less than 1")
+         self.qval = qval
+         return True
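
The docstring above notes that padding the input with '#' characters is left for future work. Until then, padded qgrams can be emulated outside the library by padding the string before tokenizing; this is only a sketch, not part of the package API.

# Illustrative only: emulate prefix/suffix padding for q = 2.
from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer

qg2_tok = QgramTokenizer()
padded = '#' + 'database' + '#'      # one pad character per side for q = 2
print(qg2_tok.tokenize(padded))
# ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e#']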
py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py
@@ -0,0 +1,30 @@
+ class Tokenizer(object):
+     """The root class for tokenizers.
+
+     Args:
+         return_set (boolean): A flag to indicate whether to return a set of
+                               tokens instead of a bag of tokens (defaults to False).
+
+     Attributes:
+         return_set (boolean): An attribute to store the flag return_set.
+     """
+
+     def __init__(self, return_set=False):
+         self.return_set = return_set
+
+     def get_return_set(self):
+         """Gets the value of the return_set flag.
+
+         Returns:
+             The boolean value of the return_set flag.
+         """
+         return self.return_set
+
+     def set_return_set(self, return_set):
+         """Sets the value of the return_set flag.
+
+         Args:
+             return_set (boolean): a flag to indicate whether to return a set of tokens instead of a bag of tokens.
+         """
+         self.return_set = return_set
+         return True