py-stringmatching 0.1.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. py_stringmatching-0.1.0/AUTHORS.rst +6 -0
  2. py_stringmatching-0.1.0/CHANGES.txt +6 -0
  3. py_stringmatching-0.1.0/LICENSE +27 -0
  4. py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
  5. py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
  6. py_stringmatching-0.1.0/MANIFEST.in +6 -0
  7. py_stringmatching-0.1.0/PKG-INFO +57 -0
  8. py_stringmatching-0.1.0/README.rst +27 -0
  9. py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
  10. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
  11. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
  12. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
  13. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
  14. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
  15. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
  16. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
  17. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
  18. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
  19. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
  20. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
  21. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
  22. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
  23. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
  24. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
  25. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
  26. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
  27. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
  28. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
  29. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
  30. py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
  31. py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
  32. py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
  33. py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
  34. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
  35. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
  36. py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
  37. py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
  38. py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
  39. py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
  40. py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
  41. py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
  42. py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
  43. py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
  44. py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
  45. py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
  46. py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
  47. py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
  48. py_stringmatching-0.1.0/requirements.txt +2 -0
  49. py_stringmatching-0.1.0/setup.cfg +5 -0
  50. py_stringmatching-0.1.0/setup.py +107 -0
@@ -0,0 +1,6 @@
1
+ The following individuals have contributed code, documentation, or expertise to py_stringmatching:
2
+
3
+ * `Ali Hitawala <https://github.com/alihitawala>`_
4
+ * `Paul Suganthan G. C. <https://github.com/paulgc>`_
5
+ * `Pradap Konda <https://github.com/kvpradap>`_
6
+ * `AnHai Doan <https://github.com/anhaidgroup>`_
@@ -0,0 +1,6 @@
1
+ v0.1.0 - 06/14/2016
2
+ * Initial release.
3
+ * Contains 5 tokenizers - Alphabetic tokenizer, Alphanumeric tokenizer, Delimiter tokenizer, Qgram tokenizer and
4
+ Whitespace tokenizer.
5
+ * Contains 14 similarity measures - Affine, Cosine, Dice, Hamming distance, Jaccard, Jaro, Jaro-Winkler,
6
+ Levenshtein, Monge-Elkan, Needleman-Wunsch, Overlap coefficient, Smith-Waterman, Soft TF-IDF, and TF-IDF.
@@ -0,0 +1,27 @@
1
+ Copyright (c) 2016, anhaidgroup
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ * Neither the name of py_stringmatching nor the names of its
15
+ contributors may be used to endorse or promote products derived from
16
+ this software without specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,30 @@
1
+ Copyright (c) 2005-2016, NumPy Developers.
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are
6
+ met:
7
+
8
+ * Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ * Redistributions in binary form must reproduce the above
12
+ copyright notice, this list of conditions and the following
13
+ disclaimer in the documentation and/or other materials provided
14
+ with the distribution.
15
+
16
+ * Neither the name of the NumPy Developers nor the names of any
17
+ contributors may be used to endorse or promote products derived
18
+ from this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2010-2016 Benjamin Peterson
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the "Software"), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7
+ the Software, and to permit persons to whom the Software is furnished to do so,
8
+ subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,6 @@
1
+ include AUTHORS.rst
2
+ include README.rst
3
+ include CHANGES.txt
4
+ include requirements.txt
5
+ include LICENSE
6
+ recursive-include LICENSES *
@@ -0,0 +1,57 @@
1
+ Metadata-Version: 1.1
2
+ Name: py_stringmatching
3
+ Version: 0.1.0
4
+ Summary: Python library for string matching.
5
+ Home-page: https://sites.google.com/site/anhaidgroup/projects/py_stringmatching
6
+ Author: UW Magellan Team
7
+ Author-email: uwmagellan@gmail.com
8
+ License: BSD
9
+ Description: py_stringmatching
10
+ =================
11
+
12
+ py_stringmatching is a software package in Python that consists of a comprehensive set of tokenizers and string similarity measures (
13
+ such as edit distance, Jaccard, and TF/IDF). It is free, open-source, and BSD-licensed.
14
+
15
+ Important links
16
+ ===============
17
+
18
+ * Repository: https://github.com/anhaidgroup/py_stringmatching
19
+ * Documentation: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/index.html
20
+ * Tutorial: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/Tutorial.html
21
+ * Issue Tracker: https://github.com/anhaidgroup/py_stringmatching/issues
22
+ * Mailing List: https://groups.google.com/forum/#!forum/py_stringmatching
23
+
24
+ Dependencies
25
+ ============
26
+
27
+ py_stringmatching is tested to work under Python 2.7, Python 3.3, Python 3.4 and Python 3.5.
28
+
29
+ The required dependencies to build the package are NumPy >= 1.7.0,
30
+ Six and a working C/C++ compiler. For the development version, you will also require Cython.
31
+
32
+ Platforms
33
+ =========
34
+
35
+ py_stringmatching has been tested on Linux, OSX and Windows.
36
+
37
+ Platform: UNKNOWN
38
+ Classifier: Development Status :: 4 - Beta
39
+ Classifier: Environment :: Console
40
+ Classifier: Intended Audience :: Developers
41
+ Classifier: Intended Audience :: Science/Research
42
+ Classifier: Intended Audience :: Education
43
+ Classifier: License :: OSI Approved :: BSD License
44
+ Classifier: Operating System :: POSIX
45
+ Classifier: Operating System :: Unix
46
+ Classifier: Operating System :: MacOS
47
+ Classifier: Operating System :: Microsoft :: Windows
48
+ Classifier: Programming Language :: Python
49
+ Classifier: Programming Language :: Python :: 2
50
+ Classifier: Programming Language :: Python :: 3
51
+ Classifier: Programming Language :: Python :: 2.7
52
+ Classifier: Programming Language :: Python :: 3.3
53
+ Classifier: Programming Language :: Python :: 3.4
54
+ Classifier: Programming Language :: Python :: 3.5
55
+ Classifier: Topic :: Scientific/Engineering
56
+ Classifier: Topic :: Utilities
57
+ Classifier: Topic :: Software Development :: Libraries
@@ -0,0 +1,27 @@
1
+ py_stringmatching
2
+ =================
3
+
4
+ py_stringmatching is a software package in Python that consists of a comprehensive set of tokenizers and string similarity measures (
5
+ such as edit distance, Jaccard, and TF/IDF). It is free, open-source, and BSD-licensed.
6
+
7
+ Important links
8
+ ===============
9
+
10
+ * Repository: https://github.com/anhaidgroup/py_stringmatching
11
+ * Documentation: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/index.html
12
+ * Tutorial: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/Tutorial.html
13
+ * Issue Tracker: https://github.com/anhaidgroup/py_stringmatching/issues
14
+ * Mailing List: https://groups.google.com/forum/#!forum/py_stringmatching
15
+
16
+ Dependencies
17
+ ============
18
+
19
+ py_stringmatching is tested to work under Python 2.7, Python 3.3, Python 3.4 and Python 3.5.
20
+
21
+ The required dependencies to build the package are NumPy >= 1.7.0,
22
+ Six and a working C/C++ compiler. For the development version, you will also require Cython.
23
+
24
+ Platforms
25
+ =========
26
+
27
+ py_stringmatching has been tested on Linux, OSX and Windows.
@@ -0,0 +1,25 @@
1
+ __version__ = "0.1.0"
2
+
3
+ # Import tokenizers
4
+ from py_stringmatching.tokenizer.alphabetic_tokenizer import AlphabeticTokenizer
5
+ from py_stringmatching.tokenizer.alphanumeric_tokenizer import AlphanumericTokenizer
6
+ from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
7
+ from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer
8
+ from py_stringmatching.tokenizer.whitespace_tokenizer import WhitespaceTokenizer
9
+
10
+ # Import similarity measures
11
+ from py_stringmatching.similarity_measure.affine import Affine
12
+ from py_stringmatching.similarity_measure.cosine import Cosine
13
+ from py_stringmatching.similarity_measure.dice import Dice
14
+ from py_stringmatching.similarity_measure.hamming_distance import HammingDistance
15
+ from py_stringmatching.similarity_measure.jaccard import Jaccard
16
+ from py_stringmatching.similarity_measure.jaro import Jaro
17
+ from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
18
+ from py_stringmatching.similarity_measure.levenshtein import Levenshtein
19
+ from py_stringmatching.similarity_measure.monge_elkan import MongeElkan
20
+ from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch
21
+ from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient
22
+ from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman
23
+ from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
24
+ from py_stringmatching.similarity_measure.tfidf import TfIdf
25
+
@@ -0,0 +1,155 @@
1
+ import numpy as np
2
+
3
+ from py_stringmatching import utils
4
+ from six.moves import xrange
5
+ from py_stringmatching.similarity_measure.sequence_similarity_measure import \
6
+ SequenceSimilarityMeasure
7
+
8
+
9
def sim_ident(char1, char2):
    """Identity similarity between two characters.

    Args:
        char1,char2 (str): Characters to compare.

    Returns:
        1 if the two characters compare equal, 0 otherwise (int).
    """
    is_same = char1 == char2
    return int(is_same)
11
+
12
+
13
class Affine(SequenceSimilarityMeasure):
    """Returns the affine gap score between two strings.

    The affine gap measure is an extension of the Needleman-Wunsch measure that handles the longer gaps more
    gracefully. For more information refer to the string matching chapter in the DI book ("Principles of Data Integration").

    Args:
        gap_start (float): Cost for the gap at the start (defaults to 1).
        gap_continuation (float): Cost for the gap continuation (defaults to 0.5).
        sim_func (function): Function computing similarity score between two characters, which are represented as strings (defaults
                             to an identity function, which returns 1 if the two characters are the same and returns 0 otherwise).

    Attributes:
        gap_start (float): An attribute to store the gap cost at the start.
        gap_continuation (float): An attribute to store the gap continuation cost.
        sim_func (function): An attribute to store the similarity function.
    """

    def __init__(self, gap_start=1, gap_continuation=0.5, sim_func=sim_ident):
        self.gap_start = gap_start
        self.gap_continuation = gap_continuation
        self.sim_func = sim_func
        super(Affine, self).__init__()

    def get_raw_score(self, string1, string2):
        """Computes the affine gap score between two strings. This score can be outside the range [0,1].

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score between the two input strings (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        # Costs are stored as positive numbers; negate them once so that they
        # can simply be added to the running scores below.
        gap_start = -self.gap_start
        gap_continuation = -self.gap_continuation

        len1 = len(string1)
        len2 = len(string2)
        neg_inf = -float("inf")

        # Use the builtin ``float`` as dtype: the ``np.float`` alias was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        shape = (len1 + 1, len2 + 1)
        m = np.zeros(shape, dtype=float)
        x = np.zeros(shape, dtype=float)
        y = np.zeros(shape, dtype=float)

        # DP initialization: first column (string1 prefix against nothing)
        for i in xrange(1, len1 + 1):
            m[i][0] = neg_inf
            x[i][0] = gap_start + (i - 1) * gap_continuation
            y[i][0] = neg_inf

        # DP initialization: first row (string2 prefix against nothing)
        for j in xrange(1, len2 + 1):
            m[0][j] = neg_inf
            x[0][j] = neg_inf
            y[0][j] = gap_start + (j - 1) * gap_continuation

        # affine gap calculation using DP
        for i in xrange(1, len1 + 1):
            for j in xrange(1, len2 + 1):
                # best score between x_1....x_i and y_1....y_j
                # given that x_i is aligned to y_j
                m[i][j] = (self.sim_func(string1[i - 1], string2[j - 1]) +
                           max(m[i - 1][j - 1], x[i - 1][j - 1],
                               y[i - 1][j - 1]))

                # the best score given that x_i is aligned to a gap
                x[i][j] = max(gap_start + m[i - 1][j],
                              gap_continuation + x[i - 1][j])

                # the best score given that y_j is aligned to a gap
                y[i][j] = max(gap_start + m[i][j - 1],
                              gap_continuation + y[i][j - 1])

        # Convert the NumPy scalar to a plain Python float so the result
        # matches the documented return type.
        return float(max(m[len1][len2], x[len1][len2], y[len1][len2]))

    def get_gap_start(self):
        """Get gap start cost.

        Returns:
            gap start cost (float).
        """
        return self.gap_start

    def get_gap_continuation(self):
        """Get gap continuation cost.

        Returns:
            gap continuation cost (float).
        """
        return self.gap_continuation

    def get_sim_func(self):
        """Get similarity function.

        Returns:
            similarity function (function).
        """
        return self.sim_func

    def set_gap_start(self, gap_start):
        """Set gap start cost.

        Args:
            gap_start (float): Cost for the gap at the start.

        Returns:
            True (bool).
        """
        self.gap_start = gap_start
        return True

    def set_gap_continuation(self, gap_continuation):
        """Set gap continuation cost.

        Args:
            gap_continuation (float): Cost for the gap continuation.

        Returns:
            True (bool).
        """
        self.gap_continuation = gap_continuation
        return True

    def set_sim_func(self, sim_func):
        """Set similarity function.

        Args:
            sim_func (function): Function computing similarity score between two characters, represented as strings.

        Returns:
            True (bool).
        """
        self.sim_func = sim_func
        return True
@@ -0,0 +1,86 @@
1
+ import math
2
+
3
+ from py_stringmatching import utils
4
+ from py_stringmatching.similarity_measure.token_similarity_measure import \
5
+ TokenSimilarityMeasure
6
+
7
+
8
class Cosine(TokenSimilarityMeasure):
    """Computes a variant of cosine measure known as Ochiai coefficient.

    This is not the cosine measure that computes the cosine of the angle between two given vectors. Rather, it computes a variant of cosine measure known as Ochiai coefficient (see the Wikipedia page "Cosine Similarity"). Specifically, for two sets X and Y, this measure computes:

    :math:`cosine(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}`
    """

    def __init__(self):
        super(Cosine, self).__init__()

    def get_raw_score(self, set1, set2):
        """Computes the raw cosine score between two sets.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Cosine similarity (float)

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> cos = Cosine()
            >>> cos.get_raw_score(['data', 'science'], ['data'])
            0.7071067811865475
            >>> cos.get_raw_score(['data', 'data', 'science'], ['data', 'management'])
            0.4999999999999999
            >>> cos.get_raw_score([], ['data'])
            0.0

        References:
            * String similarity joins: An Experimental Evaluation (a paper appearing in the VLDB 2014 Conference).
            * Project Flamingo at http://flamingo.ics.uci.edu.
        """
        # input validations
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # if exact match return 1.0
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # if one of the inputs is empty return 0.0 (a float, as documented,
        # so every code path yields the same return type)
        if utils.sim_check_for_empty(set1, set2):
            return 0.0

        # Lists may hold duplicates; the measure is defined over sets.
        if not isinstance(set1, set):
            set1 = set(set1)
        if not isinstance(set2, set):
            set2 = set(set2)

        return float(len(set1 & set2)) / (math.sqrt(float(len(set1))) *
                                          math.sqrt(float(len(set2))))

    def get_sim_score(self, set1, set2):
        """Computes the normalized cosine similarity between two sets.

        The raw score is returned unchanged by this method.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Normalized cosine similarity (float)

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> cos = Cosine()
            >>> cos.get_sim_score(['data', 'science'], ['data'])
            0.7071067811865475
            >>> cos.get_sim_score(['data', 'data', 'science'], ['data', 'management'])
            0.4999999999999999
            >>> cos.get_sim_score([], ['data'])
            0.0

        """
        return self.get_raw_score(set1, set2)