py-stringmatching 0.1.0__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_stringmatching-0.1.0/AUTHORS.rst +6 -0
- py_stringmatching-0.1.0/CHANGES.txt +6 -0
- py_stringmatching-0.1.0/LICENSE +27 -0
- py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
- py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
- py_stringmatching-0.1.0/MANIFEST.in +6 -0
- py_stringmatching-0.1.0/PKG-INFO +57 -0
- py_stringmatching-0.1.0/README.rst +27 -0
- py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
- py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
- py_stringmatching-0.1.0/requirements.txt +2 -0
- py_stringmatching-0.1.0/setup.cfg +5 -0
- py_stringmatching-0.1.0/setup.py +107 -0
|
@@ -0,0 +1,1249 @@
|
|
|
1
|
+
from __future__ import unicode_literals
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
|
|
5
|
+
from nose.tools import *
|
|
6
|
+
import unittest
|
|
7
|
+
|
|
8
|
+
# sequence based similarity measures
|
|
9
|
+
from py_stringmatching.similarity_measure.affine import Affine
|
|
10
|
+
from py_stringmatching.similarity_measure.hamming_distance import HammingDistance
|
|
11
|
+
from py_stringmatching.similarity_measure.jaro import Jaro
|
|
12
|
+
from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
|
|
13
|
+
from py_stringmatching.similarity_measure.levenshtein import Levenshtein
|
|
14
|
+
from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch
|
|
15
|
+
from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman
|
|
16
|
+
# token based similarity measures
|
|
17
|
+
from py_stringmatching.similarity_measure.cosine import Cosine
|
|
18
|
+
from py_stringmatching.similarity_measure.dice import Dice
|
|
19
|
+
from py_stringmatching.similarity_measure.jaccard import Jaccard
|
|
20
|
+
from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient
|
|
21
|
+
from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
|
|
22
|
+
from py_stringmatching.similarity_measure.tfidf import TfIdf
|
|
23
|
+
# hybrid similarity measures
|
|
24
|
+
from py_stringmatching.similarity_measure.monge_elkan import MongeElkan
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ---------------------- sequence based similarity measures ----------------------
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AffineTestCases(unittest.TestCase):
    """Tests for the ``Affine`` similarity measure.

    Exercises raw scores on valid strings, the getters and setters for
    ``gap_start``, ``gap_continuation`` and ``sim_func``, and checks that
    ``None`` and float arguments raise ``TypeError``.
    """

    def setUp(self):
        self.affine = Affine()
        self.affine_with_params1 = Affine(gap_start=2, gap_continuation=0.5)
        # Stored on self so test_get_sim_func can compare against this
        # exact function object.
        self.sim_func = lambda s1, s2: (int(1 if s1 == s2 else 0))
        self.affine_with_params2 = Affine(gap_continuation=0.2,
                                          sim_func=self.sim_func)

    def test_valid_input(self):
        self.assertAlmostEqual(
            self.affine.get_raw_score('dva', 'deeva'), 1.5)
        self.assertAlmostEqual(
            self.affine_with_params1.get_raw_score('dva', 'deeve'), -0.5)
        self.assertAlmostEqual(
            self.affine_with_params2.get_raw_score('AAAGAATTCA', 'AAATCA'),
            4.4)
        self.assertAlmostEqual(
            self.affine_with_params2.get_raw_score(' ', ' '), 1)
        self.assertEqual(self.affine.get_raw_score('', 'deeva'), 0)

    def test_get_gap_start(self):
        self.assertEqual(self.affine_with_params1.get_gap_start(), 2)

    def test_get_gap_continuation(self):
        self.assertEqual(self.affine_with_params2.get_gap_continuation(), 0.2)

    def test_get_sim_func(self):
        self.assertEqual(self.affine_with_params2.get_sim_func(),
                         self.sim_func)

    def test_set_gap_start(self):
        # The score for the same pair must change once the setter is used.
        af = Affine(gap_start=1)
        self.assertEqual(af.get_gap_start(), 1)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.5)
        self.assertEqual(af.set_gap_start(2), True)
        self.assertEqual(af.get_gap_start(), 2)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 0.5)

    def test_set_gap_continuation(self):
        af = Affine(gap_continuation=0.3)
        self.assertEqual(af.get_gap_continuation(), 0.3)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.7)
        self.assertEqual(af.set_gap_continuation(0.7), True)
        self.assertEqual(af.get_gap_continuation(), 0.7)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.3)

    def test_set_sim_func(self):
        def match_one_else_zero(s1, s2):
            return 1 if s1 == s2 else 0

        def match_two_else_minus_one(s1, s2):
            return 2 if s1 == s2 else -1

        af = Affine(sim_func=match_one_else_zero)
        self.assertEqual(af.get_sim_func(), match_one_else_zero)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.5)
        self.assertEqual(af.set_sim_func(match_two_else_minus_one), True)
        self.assertEqual(af.get_sim_func(), match_two_else_minus_one)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 4.5)

    # Invalid-input checks: each test keeps its original name and uses
    # unittest's own assertRaises instead of a third-party decorator.

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score,
                          None, 'MARHTA')

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score,
                          'MARHTA', None)

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score,
                          'MARHTA', 12.90)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score,
                          12.90, 'MARTHA')

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score, None, None)

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score, 12.90, 12.90)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class JaroTestCases(unittest.TestCase):
    """Tests for the ``Jaro`` similarity measure.

    Checks ``get_raw_score`` and ``get_sim_score`` against the same
    expected values, and checks that ``None`` and float arguments raise
    ``TypeError`` from both methods.
    """

    def setUp(self):
        self.jaro = Jaro()

    def test_valid_input_raw_score(self):
        # Example pairs from
        # https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
        self.assertAlmostEqual(self.jaro.get_raw_score('MARTHA', 'MARHTA'),
                               0.9444444444444445)
        self.assertAlmostEqual(self.jaro.get_raw_score('DWAYNE', 'DUANE'),
                               0.8222222222222223)
        self.assertAlmostEqual(self.jaro.get_raw_score('DIXON', 'DICKSONX'),
                               0.7666666666666666)
        self.assertEqual(self.jaro.get_raw_score('', 'deeva'), 0)

    def test_valid_input_sim_score(self):
        self.assertAlmostEqual(self.jaro.get_sim_score('MARTHA', 'MARHTA'),
                               0.9444444444444445)
        self.assertAlmostEqual(self.jaro.get_sim_score('DWAYNE', 'DUANE'),
                               0.8222222222222223)
        self.assertAlmostEqual(self.jaro.get_sim_score('DIXON', 'DICKSONX'),
                               0.7666666666666666)
        self.assertEqual(self.jaro.get_sim_score('', 'deeva'), 0)

    # Invalid-input checks: each test keeps its original name and uses
    # unittest's own assertRaises instead of a third-party decorator.

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.jaro.get_raw_score, None, 'MARHTA')

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.jaro.get_raw_score, 'MARHTA', None)

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.jaro.get_raw_score, None, None)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.jaro.get_raw_score, 'MARHTA', 12.90)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.jaro.get_raw_score, 12.90, 'MARTHA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.jaro.get_raw_score, 12.90, 12.90)

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.jaro.get_sim_score, None, 'MARHTA')

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.jaro.get_sim_score, 'MARHTA', None)

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.jaro.get_sim_score, None, None)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.jaro.get_sim_score, 'MARHTA', 12.90)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.jaro.get_sim_score, 12.90, 'MARTHA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.jaro.get_sim_score, 12.90, 12.90)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class JaroWinklerTestCases(unittest.TestCase):
    """Tests for the ``JaroWinkler`` similarity measure.

    Covers the ``prefix_weight`` getter and setter, raw and similarity
    scores for valid strings, and checks that ``None`` and float
    arguments raise ``TypeError`` from both scoring methods.
    """

    def setUp(self):
        self.jw = JaroWinkler()

    def test_get_prefix_weight(self):
        # A default-constructed instance is expected to report 0.1.
        self.assertEqual(self.jw.get_prefix_weight(), 0.1)

    def test_set_prefix_weight(self):
        # The score for the same pair must change once the setter is used.
        jw = JaroWinkler(prefix_weight=0.15)
        self.assertEqual(jw.get_prefix_weight(), 0.15)
        self.assertAlmostEqual(jw.get_raw_score('MARTHA', 'MARHTA'),
                               0.9694444444444444)
        self.assertEqual(jw.set_prefix_weight(0.25), True)
        self.assertEqual(jw.get_prefix_weight(), 0.25)
        self.assertAlmostEqual(jw.get_raw_score('MARTHA', 'MARHTA'),
                               0.9861111111111112)

    def test_valid_input_raw_score(self):
        # Example pairs from
        # https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
        self.assertAlmostEqual(self.jw.get_raw_score('MARTHA', 'MARHTA'),
                               0.9611111111111111)
        self.assertAlmostEqual(self.jw.get_raw_score('DWAYNE', 'DUANE'), 0.84)
        self.assertAlmostEqual(self.jw.get_raw_score('DIXON', 'DICKSONX'),
                               0.8133333333333332)

    def test_valid_input_sim_score(self):
        self.assertAlmostEqual(self.jw.get_sim_score('MARTHA', 'MARHTA'),
                               0.9611111111111111)
        self.assertAlmostEqual(self.jw.get_sim_score('DWAYNE', 'DUANE'), 0.84)
        self.assertAlmostEqual(self.jw.get_sim_score('DIXON', 'DICKSONX'),
                               0.8133333333333332)

    # Invalid-input checks: each test keeps its original name and uses
    # unittest's own assertRaises instead of a third-party decorator.

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.jw.get_raw_score, None, 'MARHTA')

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.jw.get_raw_score, 'MARHTA', None)

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.jw.get_raw_score, None, None)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.jw.get_raw_score, 'MARHTA', 12.90)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.jw.get_raw_score, 12.90, 'MARTHA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.jw.get_raw_score, 12.90, 12.90)

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.jw.get_sim_score, None, 'MARHTA')

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.jw.get_sim_score, 'MARHTA', None)

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.jw.get_sim_score, None, None)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.jw.get_sim_score, 'MARHTA', 12.90)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.jw.get_sim_score, 12.90, 'MARTHA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.jw.get_sim_score, 12.90, 12.90)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
class LevenshteinTestCases(unittest.TestCase):
    """Tests for the ``Levenshtein`` measure.

    ``get_raw_score`` is checked against expected edit distances and
    ``get_sim_score`` against ``1 - distance / longer_length`` for the
    same string pairs. ``None`` and float arguments must raise
    ``TypeError`` from both methods.
    """

    # (string1, string2, expected raw score, expected similarity score).
    # Example pairs from
    # http://oldfashionedsoftware.com/tag/levenshtein-distance/
    _CASES = (
        ('a', '', 1, 1.0 - (1.0 / 1.0)),
        ('', 'a', 1, 1.0 - (1.0 / 1.0)),
        ('abc', '', 3, 1.0 - (3.0 / 3.0)),
        ('', 'abc', 3, 1.0 - (3.0 / 3.0)),
        ('', '', 0, 1.0),
        ('a', 'a', 0, 1.0),
        ('abc', 'abc', 0, 1.0),
        ('a', 'ab', 1, 1.0 - (1.0 / 2.0)),
        ('b', 'ab', 1, 1.0 - (1.0 / 2.0)),
        ('ac', 'abc', 1, 1.0 - (1.0 / 3.0)),
        ('abcdefg', 'xabxcdxxefxgx', 6, 1.0 - (6.0 / 13.0)),
        ('ab', 'a', 1, 1.0 - (1.0 / 2.0)),
        ('ab', 'b', 1, 1.0 - (1.0 / 2.0)),
        ('abc', 'ac', 1, 1.0 - (1.0 / 3.0)),
        ('xabxcdxxefxgx', 'abcdefg', 6, 1.0 - (6.0 / 13.0)),
        ('a', 'b', 1, 1.0 - (1.0 / 1.0)),
        ('ab', 'ac', 1, 1.0 - (1.0 / 2.0)),
        ('ac', 'bc', 1, 1.0 - (1.0 / 2.0)),
        ('abc', 'axc', 1, 1.0 - (1.0 / 3.0)),
        ('xabxcdxxefxgx', '1ab2cd34ef5g6', 6, 1.0 - (6.0 / 13.0)),
        ('example', 'samples', 3, 1.0 - (3.0 / 7.0)),
        ('sturgeon', 'urgently', 6, 1.0 - (6.0 / 8.0)),
        ('levenshtein', 'frankenstein', 6, 1.0 - (6.0 / 12.0)),
        ('distance', 'difference', 5, 1.0 - (5.0 / 10.0)),
        ('java was neat', 'scala is great', 7, 1.0 - (7.0 / 14.0)),
    )

    def setUp(self):
        self.lev = Levenshtein()

    def test_valid_input_raw_score(self):
        for s1, s2, distance, _ in self._CASES:
            self.assertEqual(self.lev.get_raw_score(s1, s2), distance,
                             msg='get_raw_score(%r, %r)' % (s1, s2))

    def test_valid_input_sim_score(self):
        # Expected values are computed floats, so compare with a tolerance
        # rather than requiring bit-for-bit equality.
        for s1, s2, _, similarity in self._CASES:
            self.assertAlmostEqual(self.lev.get_sim_score(s1, s2), similarity,
                                   msg='get_sim_score(%r, %r)' % (s1, s2))

    # Invalid-input checks: each test keeps its original name and uses
    # unittest's own assertRaises instead of a third-party decorator.

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.lev.get_raw_score, 'a', None)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.lev.get_raw_score, None, 'b')

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.lev.get_raw_score, None, None)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.lev.get_raw_score, 'MARHTA', 12.90)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.lev.get_raw_score, 12.90, 'MARTHA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.lev.get_raw_score, 12.90, 12.90)

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.lev.get_sim_score, 'a', None)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.lev.get_sim_score, None, 'b')

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.lev.get_sim_score, None, None)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.lev.get_sim_score, 'MARHTA', 12.90)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.lev.get_sim_score, 12.90, 'MARTHA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.lev.get_sim_score, 12.90, 12.90)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
class HammingDistanceTestCases(unittest.TestCase):
    """Tests for the ``HammingDistance`` measure.

    ``get_raw_score`` is checked against the expected number of differing
    positions and ``get_sim_score`` against ``1 - distance / length``.
    Strings of unequal length must raise ``ValueError``; ``None`` and
    integer arguments must raise ``TypeError``.
    """

    # (string1, string2, expected raw score, expected similarity score)
    _CASES = (
        ('-789', 'john', 4, 1.0 - (4.0 / 4.0)),
        ('a', '*', 1, 1.0 - (1.0 / 1.0)),
        ('b', 'a', 1, 1.0 - (1.0 / 1.0)),
        ('abc', 'p q', 3, 1.0 - (3.0 / 3.0)),
        ('karolin', 'kathrin', 3, 1.0 - (3.0 / 7.0)),
        ('KARI', 'kari', 4, 1.0 - (4.0 / 4.0)),
        ('', '', 0, 1.0),
    )

    def setUp(self):
        self.hd = HammingDistance()

    def test_valid_input_raw_score(self):
        for s1, s2, distance, _ in self._CASES:
            self.assertEqual(self.hd.get_raw_score(s1, s2), distance,
                             msg='get_raw_score(%r, %r)' % (s1, s2))

    def test_valid_input_sim_score(self):
        # Expected values are computed floats, so compare with a tolerance
        # rather than requiring bit-for-bit equality.
        for s1, s2, _, similarity in self._CASES:
            self.assertAlmostEqual(self.hd.get_sim_score(s1, s2), similarity,
                                   msg='get_sim_score(%r, %r)' % (s1, s2))

    def test_valid_input_compatibility_raw_score(self):
        self.assertEqual(self.hd.get_raw_score(u'karolin', u'kathrin'), 3)
        self.assertEqual(self.hd.get_raw_score(u'', u''), 0)
        # TODO: UTF-8 encoded byte strings (u'foo'.encode('UTF-8')) are not
        # covered here; the original note says Python 3 raises a TypeError
        # for them ("check with Ali").

    def test_valid_input_compatibility_sim_score(self):
        self.assertAlmostEqual(
            self.hd.get_sim_score(u'karolin', u'kathrin'), 1.0 - (3.0 / 7.0))
        self.assertAlmostEqual(self.hd.get_sim_score(u'', u''), 1.0)

    # Invalid-input checks: each test keeps its original name and uses
    # unittest's own assertRaises instead of a third-party decorator.

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.hd.get_raw_score, 'a', None)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.hd.get_raw_score, None, 'b')

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.hd.get_raw_score, None, None)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(ValueError, self.hd.get_raw_score, 'a', '')

    def test_invalid_input5_raw_score(self):
        self.assertRaises(ValueError, self.hd.get_raw_score,
                          '', 'This is a long string')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(ValueError, self.hd.get_raw_score, 'ali', 'alex')

    def test_invalid_input7_raw_score(self):
        self.assertRaises(TypeError, self.hd.get_raw_score, 'MA', 12)

    def test_invalid_input8_raw_score(self):
        self.assertRaises(TypeError, self.hd.get_raw_score, 12, 'MA')

    def test_invalid_input9_raw_score(self):
        self.assertRaises(TypeError, self.hd.get_raw_score, 12, 12)

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.hd.get_sim_score, 'a', None)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.hd.get_sim_score, None, 'b')

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.hd.get_sim_score, None, None)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(ValueError, self.hd.get_sim_score, 'a', '')

    def test_invalid_input5_sim_score(self):
        self.assertRaises(ValueError, self.hd.get_sim_score,
                          '', 'This is a long string')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(ValueError, self.hd.get_sim_score, 'ali', 'alex')

    def test_invalid_input7_sim_score(self):
        self.assertRaises(TypeError, self.hd.get_sim_score, 'MA', 12)

    def test_invalid_input8_sim_score(self):
        self.assertRaises(TypeError, self.hd.get_sim_score, 12, 'MA')

    def test_invalid_input9_sim_score(self):
        self.assertRaises(TypeError, self.hd.get_sim_score, 12, 12)
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
class NeedlemanWunschTestCases(unittest.TestCase):
    """Tests for the ``NeedlemanWunsch`` similarity measure.

    Covers the ``gap_cost`` and ``sim_func`` getters and setters, raw
    scores for valid strings under several parameter combinations, and
    checks that ``None`` and list arguments raise ``TypeError``.
    """

    def setUp(self):
        self.nw = NeedlemanWunsch()
        self.nw_with_params1 = NeedlemanWunsch(0.0)
        self.nw_with_params2 = NeedlemanWunsch(
            1.0, sim_func=lambda s1, s2: (2 if s1 == s2 else -1))
        # Stored on self so test_get_sim_func can compare against this
        # exact function object.
        self.sim_func = lambda s1, s2: (1 if s1 == s2 else -1)
        self.nw_with_params3 = NeedlemanWunsch(gap_cost=0.5,
                                               sim_func=self.sim_func)

    def test_get_gap_cost(self):
        self.assertEqual(self.nw_with_params3.get_gap_cost(), 0.5)

    def test_get_sim_func(self):
        self.assertEqual(self.nw_with_params3.get_sim_func(), self.sim_func)

    def test_set_gap_cost(self):
        # The score for the same pair must change once the setter is used.
        nw = NeedlemanWunsch(gap_cost=0.5)
        self.assertEqual(nw.get_gap_cost(), 0.5)
        self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 2.0)
        self.assertEqual(nw.set_gap_cost(0.7), True)
        self.assertEqual(nw.get_gap_cost(), 0.7)
        self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'),
                               1.6000000000000001)

    def test_set_sim_func(self):
        def match_one_else_zero(s1, s2):
            return 1 if s1 == s2 else 0

        def match_two_else_minus_one(s1, s2):
            return 2 if s1 == s2 else -1

        nw = NeedlemanWunsch(sim_func=match_one_else_zero)
        self.assertEqual(nw.get_sim_func(), match_one_else_zero)
        self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 1.0)
        self.assertEqual(nw.set_sim_func(match_two_else_minus_one), True)
        self.assertEqual(nw.get_sim_func(), match_two_else_minus_one)
        self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 4.0)

    def test_valid_input(self):
        # Float scores are compared with a tolerance, consistent with the
        # setter tests above.
        self.assertAlmostEqual(
            self.nw.get_raw_score('dva', 'deeva'), 1.0)
        self.assertAlmostEqual(
            self.nw_with_params1.get_raw_score('dva', 'deeve'), 2.0)
        self.assertAlmostEqual(
            self.nw_with_params2.get_raw_score('dva', 'deeve'), 1.0)
        self.assertAlmostEqual(
            self.nw_with_params3.get_raw_score('GCATGCUA', 'GATTACA'), 2.5)

    # Invalid-input checks: each test keeps its original name and uses
    # unittest's own assertRaises instead of a third-party decorator.

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.nw.get_raw_score, 'a', None)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.nw.get_raw_score, None, 'b')

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.nw.get_raw_score, None, None)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.nw.get_raw_score, ['a'], 'b')

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.nw.get_raw_score, 'a', ['b'])

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.nw.get_raw_score, ['a'], ['b'])
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
class SmithWatermanTestCases(unittest.TestCase):
|
|
539
|
+
def setUp(self):
|
|
540
|
+
self.sw = SmithWaterman()
|
|
541
|
+
self.sw_with_params1 = SmithWaterman(2.2)
|
|
542
|
+
self.sw_with_params2 = SmithWaterman(1,
|
|
543
|
+
sim_func=lambda s1, s2: (2 if s1 == s2 else -1))
|
|
544
|
+
self.sw_with_params3 = SmithWaterman(gap_cost=1,
|
|
545
|
+
sim_func=lambda s1, s2: (int(1 if s1 == s2 else -1)))
|
|
546
|
+
self.sim_func = lambda s1, s2: (1.5 if s1 == s2 else 0.5)
|
|
547
|
+
self.sw_with_params4 = SmithWaterman(gap_cost=1.4,
|
|
548
|
+
sim_func=self.sim_func)
|
|
549
|
+
|
|
550
|
+
def test_get_gap_cost(self):
|
|
551
|
+
self.assertEqual(self.sw_with_params4.get_gap_cost(), 1.4)
|
|
552
|
+
|
|
553
|
+
def test_get_sim_func(self):
|
|
554
|
+
self.assertEqual(self.sw_with_params4.get_sim_func(), self.sim_func)
|
|
555
|
+
|
|
556
|
+
def test_set_gap_cost(self):
|
|
557
|
+
sw = SmithWaterman(gap_cost=0.3)
|
|
558
|
+
self.assertEqual(sw.get_gap_cost(), 0.3)
|
|
559
|
+
self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 2.3999999999999999)
|
|
560
|
+
self.assertEqual(sw.set_gap_cost(0.7), True)
|
|
561
|
+
self.assertEqual(sw.get_gap_cost(), 0.7)
|
|
562
|
+
self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 2.0)
|
|
563
|
+
|
|
564
|
+
def test_set_sim_func(self):
|
|
565
|
+
fn1 = lambda s1, s2: (int(1 if s1 == s2 else 0))
|
|
566
|
+
fn2 = lambda s1, s2: (int(2 if s1 == s2 else -1))
|
|
567
|
+
sw = SmithWaterman(sim_func=fn1)
|
|
568
|
+
self.assertEqual(sw.get_sim_func(), fn1)
|
|
569
|
+
self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 2.0)
|
|
570
|
+
self.assertEqual(sw.set_sim_func(fn2), True)
|
|
571
|
+
self.assertEqual(sw.get_sim_func(), fn2)
|
|
572
|
+
self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 4.0)
|
|
573
|
+
|
|
574
|
+
def test_valid_input(self):
|
|
575
|
+
self.assertEqual(self.sw.get_raw_score('cat', 'hat'), 2.0)
|
|
576
|
+
self.assertEqual(self.sw_with_params1.get_raw_score('dva', 'deeve'), 1.0)
|
|
577
|
+
self.assertEqual(self.sw_with_params2.get_raw_score('dva', 'deeve'), 2.0)
|
|
578
|
+
self.assertEqual(self.sw_with_params3.get_raw_score('GCATGCU', 'GATTACA'),
|
|
579
|
+
2.0)
|
|
580
|
+
self.assertEqual(self.sw_with_params4.get_raw_score('GCATAGCU', 'GATTACA'),
|
|
581
|
+
6.5)
|
|
582
|
+
|
|
583
|
+
@raises(TypeError)
|
|
584
|
+
def test_invalid_input1_raw_score(self):
|
|
585
|
+
self.sw.get_raw_score('a', None)
|
|
586
|
+
|
|
587
|
+
@raises(TypeError)
|
|
588
|
+
def test_invalid_input2_raw_score(self):
|
|
589
|
+
self.sw.get_raw_score(None, 'b')
|
|
590
|
+
|
|
591
|
+
@raises(TypeError)
|
|
592
|
+
def test_invalid_input3_raw_score(self):
|
|
593
|
+
self.sw.get_raw_score(None, None)
|
|
594
|
+
|
|
595
|
+
def test_invalid_input4_raw_score(self):
    """An integer second argument must raise TypeError."""
    with self.assertRaises(TypeError):
        self.sw.get_raw_score('MARHTA', 12)
|
|
598
|
+
|
|
599
|
+
def test_invalid_input5_raw_score(self):
    """An integer first argument must raise TypeError."""
    with self.assertRaises(TypeError):
        self.sw.get_raw_score(12, 'MARTHA')
|
|
602
|
+
|
|
603
|
+
def test_invalid_input6_raw_score(self):
    """Two integer arguments must raise TypeError."""
    with self.assertRaises(TypeError):
        self.sw.get_raw_score(12, 12)
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
# ---------------------- token based similarity measures ----------------------
|
|
609
|
+
|
|
610
|
+
# ---------------------- set based similarity measures ----------------------
|
|
611
|
+
class OverlapCoefficientTestCases(unittest.TestCase):
    """Expected scores and TypeError checks for OverlapCoefficient."""

    def setUp(self):
        self.oc = OverlapCoefficient()

    def _check_list_scores(self, score):
        # The raw-score and sim-score tests assert the same values on the
        # same token lists; `score` is the bound method under test.
        self.assertEqual(score([], []), 1.0)
        self.assertEqual(score(['data', 'science'], ['data']),
                         1.0 / min(2.0, 1.0))
        self.assertEqual(score(['data', 'science'], ['science', 'good']),
                         1.0 / min(2.0, 3.0))
        self.assertEqual(score([], ['data']), 0)
        self.assertEqual(score(['data', 'data', 'science'],
                               ['data', 'management']),
                         1.0 / min(3.0, 2.0))

    def test_valid_input_raw_score(self):
        """Raw score on token lists, including empty and repeated tokens."""
        self._check_list_scores(self.oc.get_raw_score)

    def test_valid_input_raw_score_set_inp(self):
        """Raw score when both inputs are sets rather than lists."""
        left = {'data', 'science'}
        right = {'data'}
        self.assertEqual(self.oc.get_raw_score(left, right),
                         1.0 / min(2.0, 1.0))

    def test_valid_input_sim_score(self):
        """Sim score on the same token lists as the raw-score test."""
        self._check_list_scores(self.oc.get_sim_score)

    # ---- get_raw_score: None or plain-string input must raise TypeError ----

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.oc.get_raw_score, ['a'], None)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.oc.get_raw_score, None, ['b'])

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.oc.get_raw_score, None, None)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.oc.get_raw_score, ['MARHTA'], 'MARTHA')

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.oc.get_raw_score, 'MARHTA', ['MARTHA'])

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.oc.get_raw_score, 'MARTHA', 'MARTHA')

    # ---- get_sim_score: None or plain-string input must raise TypeError ----

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.oc.get_sim_score, ['a'], None)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.oc.get_sim_score, None, ['b'])

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.oc.get_sim_score, None, None)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.oc.get_sim_score, ['MARHTA'], 'MARTHA')

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.oc.get_sim_score, 'MARHTA', ['MARTHA'])

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.oc.get_sim_score, 'MARTHA', 'MARTHA')
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
class DiceTestCases(unittest.TestCase):
    """Expected scores and TypeError checks for the Dice measure.

    Score comparisons use assertAlmostEqual: several expected values
    (2/3, 6/11) are not exactly representable as binary floats, so exact
    equality would tie the test to the implementation's operation order.
    """

    def setUp(self):
        self.dice = Dice()

    def _check_scores(self, score):
        # get_raw_score and get_sim_score are asserted against the same
        # expected values; `score` is the bound method under test.
        self.assertAlmostEqual(score(['data', 'science'], ['data']),
                               2 * 1.0 / 3.0)
        self.assertAlmostEqual(score(['data', 'science'], ['science', 'good']),
                               2 * 1.0 / 4.0)
        self.assertAlmostEqual(score([], ['data']), 0)
        # Repeated tokens, in either argument position.
        self.assertAlmostEqual(score(['data', 'data', 'science'],
                                     ['data', 'management']),
                               2 * 1.0 / 4.0)
        self.assertAlmostEqual(score(['data', 'management'],
                                     ['data', 'data', 'science']),
                               2 * 1.0 / 4.0)
        self.assertAlmostEqual(score([], []), 1.0)
        self.assertAlmostEqual(score(['a', 'b'], ['b', 'a']), 1.0)
        # Set inputs.
        self.assertAlmostEqual(score(set([]), set([])), 1.0)
        self.assertAlmostEqual(score({1, 1, 2, 3, 4},
                                     {2, 3, 4, 5, 6, 7, 7, 8}),
                               2 * 3.0 / 11.0)

    def test_valid_input_raw_score(self):
        """Raw score on lists and sets, including empty inputs."""
        self._check_scores(self.dice.get_raw_score)

    def test_valid_input_sim_score(self):
        """Sim score on the same inputs as the raw-score test."""
        self._check_scores(self.dice.get_sim_score)

    # ---- get_raw_score: int, None or plain-string input must raise TypeError ----

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, 1, 1)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, ['a'], None)

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, None, ['b'])

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, None, None)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, None, 'MARHTA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, 'MARHTA', None)

    def test_invalid_input7_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, 'MARHTA', 'MARTHA')

    # ---- get_sim_score: int, None or plain-string input must raise TypeError ----

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, 1, 1)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, ['a'], None)

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, None, ['b'])

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, None, None)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, None, 'MARHTA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, 'MARHTA', None)

    def test_invalid_input7_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, 'MARHTA', 'MARTHA')
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
class JaccardTestCases(unittest.TestCase):
    """Expected scores and TypeError checks for the Jaccard measure.

    Score comparisons use assertAlmostEqual because expected values such
    as 1/3 are not exactly representable as binary floats.
    """

    def setUp(self):
        self.jac = Jaccard()

    def _check_scores(self, score):
        # get_raw_score and get_sim_score are asserted against the same
        # expected values; `score` is the bound method under test.
        self.assertAlmostEqual(score(['data', 'science'], ['data']),
                               1.0 / 2.0)
        self.assertAlmostEqual(score(['data', 'science'], ['science', 'good']),
                               1.0 / 3.0)
        self.assertAlmostEqual(score([], ['data']), 0)
        # Repeated tokens, in either argument position.
        self.assertAlmostEqual(score(['data', 'data', 'science'],
                                     ['data', 'management']),
                               1.0 / 3.0)
        self.assertAlmostEqual(score(['data', 'management'],
                                     ['data', 'data', 'science']),
                               1.0 / 3.0)
        self.assertAlmostEqual(score([], []), 1.0)
        # Set inputs.
        self.assertAlmostEqual(score(set([]), set([])), 1.0)
        self.assertAlmostEqual(score({1, 1, 2, 3, 4},
                                     {2, 3, 4, 5, 6, 7, 7, 8}),
                               3.0 / 8.0)

    def test_valid_input_raw_score(self):
        """Raw score on lists and sets, including empty inputs."""
        self._check_scores(self.jac.get_raw_score)

    def test_valid_input_sim_score(self):
        """Sim score on the same inputs as the raw-score test."""
        self._check_scores(self.jac.get_sim_score)

    # ---- get_raw_score: int, None or plain-string input must raise TypeError ----

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.jac.get_raw_score, 1, 1)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.jac.get_raw_score, ['a'], None)

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.jac.get_raw_score, None, ['b'])

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.jac.get_raw_score, None, None)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.jac.get_raw_score, ['MARHTA'], 'MARTHA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.jac.get_raw_score, 'MARHTA', ['MARTHA'])

    def test_invalid_input7_raw_score(self):
        self.assertRaises(TypeError, self.jac.get_raw_score, 'MARTHA', 'MARTHA')

    # ---- get_sim_score: int, None or plain-string input must raise TypeError ----

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.jac.get_sim_score, 1, 1)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.jac.get_sim_score, ['a'], None)

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.jac.get_sim_score, None, ['b'])

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.jac.get_sim_score, None, None)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.jac.get_sim_score, ['MARHTA'], 'MARTHA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.jac.get_sim_score, 'MARHTA', ['MARTHA'])

    def test_invalid_input7_sim_score(self):
        self.assertRaises(TypeError, self.jac.get_sim_score, 'MARTHA', 'MARTHA')
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
class CosineTestCases(unittest.TestCase):
    """Expected scores and TypeError checks for the Cosine measure.

    Expected values are built from math.sqrt quotients, so scores are
    compared with assertAlmostEqual rather than exact float equality.
    """

    def setUp(self):
        self.cos = Cosine()

    def _check_scores(self, score):
        # get_raw_score and get_sim_score are asserted against the same
        # expected values; `score` is the bound method under test.
        self.assertAlmostEqual(score(['data', 'science'], ['data']),
                               1.0 / (math.sqrt(2) * math.sqrt(1)))
        self.assertAlmostEqual(score(['data', 'science'], ['science', 'good']),
                               1.0 / (math.sqrt(2) * math.sqrt(2)))
        self.assertAlmostEqual(score([], ['data']), 0.0)
        # Repeated tokens, in either argument position.
        self.assertAlmostEqual(score(['data', 'data', 'science'],
                                     ['data', 'management']),
                               1.0 / (math.sqrt(2) * math.sqrt(2)))
        self.assertAlmostEqual(score(['data', 'management'],
                                     ['data', 'data', 'science']),
                               1.0 / (math.sqrt(2) * math.sqrt(2)))
        self.assertAlmostEqual(score([], []), 1.0)
        # Set inputs.
        self.assertAlmostEqual(score(set([]), set([])), 1.0)
        self.assertAlmostEqual(score({1, 1, 2, 3, 4},
                                     {2, 3, 4, 5, 6, 7, 7, 8}),
                               3.0 / (math.sqrt(4) * math.sqrt(7)))

    def test_valid_input_raw_score(self):
        """Raw score on lists and sets, including empty inputs."""
        self._check_scores(self.cos.get_raw_score)

    def test_valid_input_sim_score(self):
        """Sim score on the same inputs as the raw-score test."""
        self._check_scores(self.cos.get_sim_score)

    # ---- get_raw_score: int, None or plain-string input must raise TypeError ----
    # Method names keep their original (non-sequential) numbering so test
    # ids stay stable.

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, 1, 1)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, ['a'], None)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, None, ['b'])

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, None, None)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, ['MARHTA'], 'MARTHA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, 'MARHTA', ['MARTHA'])

    def test_invalid_input7_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, 'MARTHA', 'MARTHA')

    # ---- get_sim_score: int, None or plain-string input must raise TypeError ----

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, 1, 1)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, ['a'], None)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, None, ['b'])

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, None, None)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, ['MARHTA'], 'MARTHA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, 'MARHTA', ['MARTHA'])

    def test_invalid_input7_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, 'MARTHA', 'MARTHA')
|
|
959
|
+
|
|
960
|
+
|
|
961
|
+
class TfidfTestCases(unittest.TestCase):
    """Accessor, scoring and TypeError checks for the TfIdf measure.

    All score comparisons use assertAlmostEqual. The expected values are
    16-digit float literals, so exact equality would make the tests
    sensitive to the platform's floating-point results; the setter tests
    in this class already compared approximately.
    """

    def setUp(self):
        self.tfidf = TfIdf()
        self.corpus = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']]
        self.tfidf_with_params1 = TfIdf(self.corpus, True)
        self.tfidf_with_params2 = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']])
        self.tfidf_with_params3 = TfIdf([['x', 'y'], ['w'], ['q']])

    def _check_scores(self, method_name):
        # The raw-score and sim-score tests assert identical values;
        # `method_name` selects which method is looked up on each fixture.
        def score(measure, bag1, bag2):
            return getattr(measure, method_name)(bag1, bag2)

        self.assertAlmostEqual(
            score(self.tfidf_with_params1, ['a', 'b', 'a'], ['a', 'c']),
            0.11166746710505392)
        self.assertAlmostEqual(
            score(self.tfidf_with_params2, ['a', 'b', 'a'], ['a', 'c']),
            0.17541160386140586)
        self.assertAlmostEqual(
            score(self.tfidf_with_params2, ['a', 'b', 'a'], ['a']),
            0.5547001962252291)
        self.assertAlmostEqual(
            score(self.tfidf, ['a', 'b', 'a'], ['a']), 0.7071067811865475)
        self.assertAlmostEqual(
            score(self.tfidf_with_params3, ['a', 'b', 'a'], ['a']), 0.0)
        # The original suite repeats this assertion after exercising
        # another instance; the repetition is kept.
        self.assertAlmostEqual(
            score(self.tfidf, ['a', 'b', 'a'], ['a']), 0.7071067811865475)
        self.assertAlmostEqual(
            score(self.tfidf, ['a', 'b', 'a'], ['a', 'b', 'a']), 1.0)
        self.assertAlmostEqual(
            score(self.tfidf, [], ['a', 'b', 'a']), 0.0)

    def test_get_corpus_list(self):
        """The corpus passed to the constructor is returned by the getter."""
        self.assertEqual(self.tfidf_with_params1.get_corpus_list(), self.corpus)

    def test_get_dampen(self):
        """The dampen flag passed to the constructor is returned by the getter."""
        self.assertEqual(self.tfidf_with_params1.get_dampen(), True)

    def test_set_corpus_list(self):
        """Replacing the corpus changes both the getter value and the score."""
        corpus1 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']]
        corpus2 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b'], ['c', 'a', 'b']]
        tfidf = TfIdf(corpus_list=corpus1)
        self.assertEqual(tfidf.get_corpus_list(), corpus1)
        self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']),
                               0.7999999999999999)
        self.assertEqual(tfidf.set_corpus_list(corpus2), True)
        self.assertEqual(tfidf.get_corpus_list(), corpus2)
        self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']),
                               0.8320502943378437)

    def test_set_dampen(self):
        """Toggling dampen changes both the getter value and the score."""
        tfidf = TfIdf(self.corpus, dampen=False)
        self.assertEqual(tfidf.get_dampen(), False)
        self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']),
                               0.7999999999999999)
        self.assertEqual(tfidf.set_dampen(True), True)
        self.assertEqual(tfidf.get_dampen(), True)
        self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']),
                               0.5495722661728765)

    def test_valid_input_raw_score(self):
        """Raw scores of the fixture instances on known token bags."""
        self._check_scores('get_raw_score')

    def test_valid_input_sim_score(self):
        """Sim scores on the same inputs as the raw-score test."""
        self._check_scores('get_sim_score')

    # ---- get_raw_score: int, None or plain-string input must raise TypeError ----
    # Method names keep their original (non-sequential) numbering so test
    # ids stay stable.

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.tfidf.get_raw_score, 1, 1)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.tfidf.get_raw_score, ['a'], None)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.tfidf.get_raw_score, None, ['b'])

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.tfidf.get_raw_score, None, None)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.tfidf.get_raw_score, ['MARHTA'], 'MARTHA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.tfidf.get_raw_score, 'MARHTA', ['MARTHA'])

    def test_invalid_input7_raw_score(self):
        self.assertRaises(TypeError, self.tfidf.get_raw_score, 'MARTHA', 'MARTHA')

    # ---- get_sim_score: int, None or plain-string input must raise TypeError ----

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.tfidf.get_sim_score, 1, 1)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.tfidf.get_sim_score, ['a'], None)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.tfidf.get_sim_score, None, ['b'])

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.tfidf.get_sim_score, None, None)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.tfidf.get_sim_score, ['MARHTA'], 'MARTHA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.tfidf.get_sim_score, 'MARHTA', ['MARTHA'])

    def test_invalid_input7_sim_score(self):
        self.assertRaises(TypeError, self.tfidf.get_sim_score, 'MARTHA', 'MARTHA')
|
|
1074
|
+
|
|
1075
|
+
|
|
1076
|
+
# ---------------------- hybrid similarity measure ----------------------
|
|
1077
|
+
|
|
1078
|
+
class Soft_TfidfTestCases(unittest.TestCase):
    """Accessor, scoring and TypeError checks for the SoftTfIdf measure.

    Score comparisons use assertAlmostEqual throughout. The expected
    values are long float literals, and the setter tests in this class
    already compared approximately; test_valid_input_raw_score now does
    the same.
    """

    def setUp(self):
        self.soft_tfidf = SoftTfIdf()
        self.corpus = [['a', 'b', 'a'], ['a', 'c'], ['a']]
        self.soft_tfidf_with_params1 = SoftTfIdf(self.corpus,
                                                 sim_func=Jaro().get_raw_score,
                                                 threshold=0.8)
        self.soft_tfidf_with_params2 = SoftTfIdf(self.corpus,
                                                 threshold=0.9)
        self.soft_tfidf_with_params3 = SoftTfIdf([['x', 'y'], ['w'], ['q']])
        # Kept on self so test_get_sim_func can compare against the exact
        # callable handed to the constructor.
        self.affine_fn = Affine().get_raw_score
        self.soft_tfidf_with_params4 = SoftTfIdf(sim_func=self.affine_fn,
                                                 threshold=0.6)

    def test_get_corpus_list(self):
        """The corpus passed to the constructor is returned by the getter."""
        self.assertEqual(self.soft_tfidf_with_params1.get_corpus_list(),
                         self.corpus)

    def test_get_sim_func(self):
        """The similarity function passed to the constructor is returned."""
        self.assertEqual(self.soft_tfidf_with_params4.get_sim_func(),
                         self.affine_fn)

    def test_get_threshold(self):
        """The threshold passed to the constructor is returned by the getter."""
        self.assertEqual(self.soft_tfidf_with_params4.get_threshold(), 0.6)

    def test_set_corpus_list(self):
        """Replacing the corpus changes both the getter value and the score."""
        corpus1 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']]
        corpus2 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b'], ['c', 'a', 'b']]
        soft_tfidf = SoftTfIdf(corpus_list=corpus1)
        self.assertEqual(soft_tfidf.get_corpus_list(), corpus1)
        self.assertAlmostEqual(soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']),
                               0.7999999999999999)
        self.assertEqual(soft_tfidf.set_corpus_list(corpus2), True)
        self.assertEqual(soft_tfidf.get_corpus_list(), corpus2)
        self.assertAlmostEqual(soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']),
                               0.8320502943378437)

    def test_set_threshold(self):
        """Changing the threshold changes both the getter value and the score."""
        soft_tfidf = SoftTfIdf(threshold=0.5)
        self.assertEqual(soft_tfidf.get_threshold(), 0.5)
        self.assertAlmostEqual(
            soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']),
            0.8179128813519699)
        self.assertEqual(soft_tfidf.set_threshold(0.7), True)
        self.assertEqual(soft_tfidf.get_threshold(), 0.7)
        self.assertAlmostEqual(
            soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']),
            0.4811252243246882)

    def test_set_sim_func(self):
        """Swapping the similarity function changes the getter value and score."""
        fn1 = JaroWinkler().get_raw_score
        fn2 = Jaro().get_raw_score
        soft_tfidf = SoftTfIdf(sim_func=fn1)
        self.assertEqual(soft_tfidf.get_sim_func(), fn1)
        self.assertAlmostEqual(
            soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']),
            0.8612141515411919)
        self.assertEqual(soft_tfidf.set_sim_func(fn2), True)
        self.assertEqual(soft_tfidf.get_sim_func(), fn2)
        self.assertAlmostEqual(
            soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']),
            0.8179128813519699)

    def test_valid_input_raw_score(self):
        """Raw scores of the fixture instances on known token bags."""
        self.assertAlmostEqual(
            self.soft_tfidf_with_params1.get_raw_score(
                ['a', 'b', 'a'], ['a', 'c']),
            0.17541160386140586)
        self.assertAlmostEqual(
            self.soft_tfidf_with_params2.get_raw_score(
                ['a', 'b', 'a'], ['a']),
            0.5547001962252291)
        self.assertAlmostEqual(
            self.soft_tfidf_with_params3.get_raw_score(
                ['a', 'b', 'a'], ['a']),
            0.0)
        self.assertAlmostEqual(
            self.soft_tfidf_with_params4.get_raw_score(
                ['aa', 'bb', 'a'], ['ab', 'ba']),
            0.81649658092772592)
        self.assertAlmostEqual(
            self.soft_tfidf.get_raw_score(
                ['a', 'b', 'a'], ['a', 'b', 'a']),
            1.0)
        self.assertAlmostEqual(
            self.soft_tfidf.get_raw_score([], ['a', 'b', 'a']), 0.0)

    # ---- get_raw_score: int, None or plain-string input must raise TypeError ----
    # Method names keep their original (non-sequential) numbering so test
    # ids stay stable.

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.soft_tfidf.get_raw_score, 1, 1)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.soft_tfidf.get_raw_score, ['a'], None)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.soft_tfidf.get_raw_score, None, ['b'])

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.soft_tfidf.get_raw_score, None, None)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.soft_tfidf.get_raw_score,
                          ['MARHTA'], 'MARTHA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.soft_tfidf.get_raw_score,
                          'MARHTA', ['MARTHA'])

    def test_invalid_input7_raw_score(self):
        self.assertRaises(TypeError, self.soft_tfidf.get_raw_score,
                          'MARTHA', 'MARTHA')
|
|
1171
|
+
|
|
1172
|
+
|
|
1173
|
+
class MongeElkanTestCases(unittest.TestCase):
|
|
1174
|
+
def setUp(self):
    """Build MongeElkan fixtures: default, Needleman-Wunsch based, Affine based."""
    # The affine callable is stored so test_get_sim_func can compare the
    # getter's result against it.
    affine_score = Affine().get_raw_score
    nw_score = NeedlemanWunsch().get_raw_score

    self.me = MongeElkan()
    self.me_with_nw = MongeElkan(nw_score)
    self.affine_fn = affine_score
    self.me_with_affine = MongeElkan(self.affine_fn)
|
|
1179
|
+
|
|
1180
|
+
def test_get_sim_func(self):
    """The similarity function passed to the constructor is returned by the getter."""
    configured = self.me_with_affine.get_sim_func()
    self.assertEqual(configured, self.affine_fn)
|
|
1182
|
+
|
|
1183
|
+
def test_set_sim_func(self):
    """Swapping the similarity function changes the getter value and the score."""
    tokens_a = ('Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University',
                'of', 'California,', 'San', 'Diego')
    tokens_b = ('Department', 'of', 'Computer', 'Science,', 'Univ.',
                'Calif.,', 'San', 'Diego')
    jaro_winkler_score = JaroWinkler().get_raw_score
    needleman_wunsch_score = NeedlemanWunsch().get_raw_score

    measure = MongeElkan(sim_func=jaro_winkler_score)
    self.assertEqual(measure.get_sim_func(), jaro_winkler_score)
    # Fresh lists are built for every call, as the original literals were.
    self.assertAlmostEqual(
        measure.get_raw_score(list(tokens_a), list(tokens_b)),
        0.8364448051948052)

    setter_result = measure.set_sim_func(needleman_wunsch_score)
    self.assertEqual(setter_result, True)
    self.assertEqual(measure.get_sim_func(), needleman_wunsch_score)
    self.assertAlmostEqual(
        measure.get_raw_score(list(tokens_a), list(tokens_b)),
        2.0)
|
|
1198
|
+
|
|
1199
|
+
def test_valid_input(self):
    """Raw scores of the fixture measures on known token lists.

    Scores are compared with assertAlmostEqual: expected values such as
    0.8049999999999999 are float results whose last digits can vary, and
    test_set_sim_func already compares these scores approximately.
    """
    tokens_a = ('Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University',
                'of', 'California,', 'San', 'Diego')
    tokens_b = ('Department', 'of', 'Computer', 'Science,', 'Univ.',
                'Calif.,', 'San', 'Diego')

    # TODO: need to check this expected value (flagged by the original author).
    self.assertAlmostEqual(self.me.get_raw_score([''], ['']), 1.0)

    self.assertAlmostEqual(self.me.get_raw_score([''], ['a']), 0.0)
    self.assertAlmostEqual(self.me.get_raw_score(['a'], ['a']), 1.0)

    self.assertAlmostEqual(self.me.get_raw_score(['Niall'], ['Neal']),
                           0.8049999999999999)
    self.assertAlmostEqual(self.me.get_raw_score(['Niall'], ['Njall']), 0.88)

    # Same token lists scored with each fixture; fresh lists per call, as
    # the original literals were.
    self.assertAlmostEqual(
        self.me.get_raw_score(list(tokens_a), list(tokens_b)),
        0.8364448051948052)
    self.assertAlmostEqual(
        self.me_with_nw.get_raw_score(list(tokens_a), list(tokens_b)),
        2.0)
    self.assertAlmostEqual(
        self.me_with_affine.get_raw_score(list(tokens_a), list(tokens_b)),
        2.25)

    self.assertAlmostEqual(self.me.get_raw_score(['Niall'], ['Niel']),
                           0.8266666666666667)
    self.assertAlmostEqual(self.me.get_raw_score(['Niall'], ['Nigel']),
                           0.7866666666666667)
    self.assertAlmostEqual(self.me.get_raw_score([], ['Nigel']), 0.0)
|
|
1222
|
+
|
|
1223
|
+
def test_invalid_input1_raw_score(self):
    """Two integer arguments must raise TypeError."""
    with self.assertRaises(TypeError):
        self.me.get_raw_score(1, 1)
|
|
1226
|
+
|
|
1227
|
+
def test_invalid_input2_raw_score(self):
    """A None first argument must raise TypeError."""
    with self.assertRaises(TypeError):
        self.me.get_raw_score(None, ['b'])
|
|
1230
|
+
|
|
1231
|
+
def test_invalid_input3_raw_score(self):
    """Two None arguments must raise TypeError."""
    with self.assertRaises(TypeError):
        self.me.get_raw_score(None, None)
|
|
1234
|
+
|
|
1235
|
+
def test_invalid_input4_raw_score(self):
    """Two plain strings (instead of token collections) must raise TypeError."""
    with self.assertRaises(TypeError):
        self.me.get_raw_score("temp", "temp")
|
|
1238
|
+
|
|
1239
|
+
def test_invalid_input5_raw_score(self):
    """A plain-string second argument must raise TypeError."""
    with self.assertRaises(TypeError):
        self.me.get_raw_score(['temp'], 'temp')
|
|
1242
|
+
|
|
1243
|
+
def test_invalid_input6_raw_score(self):
    """A None second argument must raise TypeError."""
    with self.assertRaises(TypeError):
        self.me.get_raw_score(['a'], None)
|
|
1246
|
+
|
|
1247
|
+
def test_invalid_input7_raw_score(self):
    """A plain-string first argument must raise TypeError."""
    with self.assertRaises(TypeError):
        self.me.get_raw_score('temp', ['temp'])
|