py-stringmatching 0.1.0__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_stringmatching-0.1.0/AUTHORS.rst +6 -0
- py_stringmatching-0.1.0/CHANGES.txt +6 -0
- py_stringmatching-0.1.0/LICENSE +27 -0
- py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
- py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
- py_stringmatching-0.1.0/MANIFEST.in +6 -0
- py_stringmatching-0.1.0/PKG-INFO +57 -0
- py_stringmatching-0.1.0/README.rst +27 -0
- py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
- py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
- py_stringmatching-0.1.0/requirements.txt +2 -0
- py_stringmatching-0.1.0/setup.cfg +5 -0
- py_stringmatching-0.1.0/setup.py +107 -0
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
The following individuals have contributed code, documentation, or expertise to py_stringmatching:
|
|
2
|
+
|
|
3
|
+
* `Ali Hitawala <https://github.com/alihitawala>`_
|
|
4
|
+
* `Paul Suganthan G. C. <https://github.com/paulgc>`_
|
|
5
|
+
* `Pradap Konda <https://github.com/kvpradap>`_
|
|
6
|
+
* `AnHai Doan <https://github.com/anhaidgroup>`_
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
v0.1.0 - 06/14/2016
|
|
2
|
+
* Initial release.
|
|
3
|
+
* Contains 5 tokenizers - Alphabetic tokenizer, Alphanumeric tokenizer, Delimiter tokenizer, Qgram tokenizer and
|
|
4
|
+
Whitespace tokenizer.
|
|
5
|
+
* Contains 14 similarity measures - Affine, Cosine, Dice, Hamming distance, Jaccard, Jaro, Jaro-Winkler,
|
|
6
|
+
Levenshtein, Monge-Elkan, Needleman-Wunsch, Overlap coefficient, Smith-Waterman, Soft TF-IDF, and TF-IDF.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
Copyright (c) 2016, anhaidgroup
|
|
2
|
+
All rights reserved.
|
|
3
|
+
|
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
|
5
|
+
modification, are permitted provided that the following conditions are met:
|
|
6
|
+
|
|
7
|
+
* Redistributions of source code must retain the above copyright notice, this
|
|
8
|
+
list of conditions and the following disclaimer.
|
|
9
|
+
|
|
10
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
|
11
|
+
this list of conditions and the following disclaimer in the documentation
|
|
12
|
+
and/or other materials provided with the distribution.
|
|
13
|
+
|
|
14
|
+
* Neither the name of py_stringmatching nor the names of its
|
|
15
|
+
contributors may be used to endorse or promote products derived from
|
|
16
|
+
this software without specific prior written permission.
|
|
17
|
+
|
|
18
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
19
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
20
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
21
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
22
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
23
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
24
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
25
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
26
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
27
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Copyright (c) 2005-2016, NumPy Developers.
|
|
2
|
+
All rights reserved.
|
|
3
|
+
|
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
|
5
|
+
modification, are permitted provided that the following conditions are
|
|
6
|
+
met:
|
|
7
|
+
|
|
8
|
+
* Redistributions of source code must retain the above copyright
|
|
9
|
+
notice, this list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
* Redistributions in binary form must reproduce the above
|
|
12
|
+
copyright notice, this list of conditions and the following
|
|
13
|
+
disclaimer in the documentation and/or other materials provided
|
|
14
|
+
with the distribution.
|
|
15
|
+
|
|
16
|
+
* Neither the name of the NumPy Developers nor the names of any
|
|
17
|
+
contributors may be used to endorse or promote products derived
|
|
18
|
+
from this software without specific prior written permission.
|
|
19
|
+
|
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
21
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
22
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
23
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
24
|
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
25
|
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
26
|
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
27
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
28
|
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
29
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
30
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Copyright (c) 2010-2016 Benjamin Peterson
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
4
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
5
|
+
the Software without restriction, including without limitation the rights to
|
|
6
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
7
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
|
8
|
+
subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
|
15
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
16
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
Metadata-Version: 1.1
|
|
2
|
+
Name: py_stringmatching
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python library for string matching.
|
|
5
|
+
Home-page: https://sites.google.com/site/anhaidgroup/projects/py_stringmatching
|
|
6
|
+
Author: UW Magellan Team
|
|
7
|
+
Author-email: uwmagellan@gmail.com
|
|
8
|
+
License: BSD
|
|
9
|
+
Description: py_stringmatching
|
|
10
|
+
=================
|
|
11
|
+
|
|
12
|
+
py_stringmatching is a software package in Python that consists of a comprehensive set of tokenizers and string similarity measures (
|
|
13
|
+
such as edit distance, Jaccard, and TF/IDF). It is free, open-source, and BSD-licensed.
|
|
14
|
+
|
|
15
|
+
Important links
|
|
16
|
+
===============
|
|
17
|
+
|
|
18
|
+
* Repository: https://github.com/anhaidgroup/py_stringmatching
|
|
19
|
+
* Documentation: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/index.html
|
|
20
|
+
* Tutorial: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/Tutorial.html
|
|
21
|
+
* Issue Tracker: https://github.com/anhaidgroup/py_stringmatching/issues
|
|
22
|
+
* Mailing List: https://groups.google.com/forum/#!forum/py_stringmatching
|
|
23
|
+
|
|
24
|
+
Dependencies
|
|
25
|
+
============
|
|
26
|
+
|
|
27
|
+
py_stringmatching is tested to work under Python 2.7, Python 3.3, Python 3.4 and Python 3.5.
|
|
28
|
+
|
|
29
|
+
The required dependencies to build the package are NumPy >= 1.7.0,
|
|
30
|
+
Six and a working C/C++ compiler. For the development version, you will also require Cython.
|
|
31
|
+
|
|
32
|
+
Platforms
|
|
33
|
+
=========
|
|
34
|
+
|
|
35
|
+
py_stringmatching has been tested on Linux, OSX and Windows.
|
|
36
|
+
|
|
37
|
+
Platform: UNKNOWN
|
|
38
|
+
Classifier: Development Status :: 4 - Beta
|
|
39
|
+
Classifier: Environment :: Console
|
|
40
|
+
Classifier: Intended Audience :: Developers
|
|
41
|
+
Classifier: Intended Audience :: Science/Research
|
|
42
|
+
Classifier: Intended Audience :: Education
|
|
43
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
44
|
+
Classifier: Operating System :: POSIX
|
|
45
|
+
Classifier: Operating System :: Unix
|
|
46
|
+
Classifier: Operating System :: MacOS
|
|
47
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
48
|
+
Classifier: Programming Language :: Python
|
|
49
|
+
Classifier: Programming Language :: Python :: 2
|
|
50
|
+
Classifier: Programming Language :: Python :: 3
|
|
51
|
+
Classifier: Programming Language :: Python :: 2.7
|
|
52
|
+
Classifier: Programming Language :: Python :: 3.3
|
|
53
|
+
Classifier: Programming Language :: Python :: 3.4
|
|
54
|
+
Classifier: Programming Language :: Python :: 3.5
|
|
55
|
+
Classifier: Topic :: Scientific/Engineering
|
|
56
|
+
Classifier: Topic :: Utilities
|
|
57
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
py_stringmatching
|
|
2
|
+
=================
|
|
3
|
+
|
|
4
|
+
py_stringmatching is a software package in Python that consists of a comprehensive set of tokenizers and string similarity measures (
|
|
5
|
+
such as edit distance, Jaccard, and TF/IDF). It is free, open-source, and BSD-licensed.
|
|
6
|
+
|
|
7
|
+
Important links
|
|
8
|
+
===============
|
|
9
|
+
|
|
10
|
+
* Repository: https://github.com/anhaidgroup/py_stringmatching
|
|
11
|
+
* Documentation: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/index.html
|
|
12
|
+
* Tutorial: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/Tutorial.html
|
|
13
|
+
* Issue Tracker: https://github.com/anhaidgroup/py_stringmatching/issues
|
|
14
|
+
* Mailing List: https://groups.google.com/forum/#!forum/py_stringmatching
|
|
15
|
+
|
|
16
|
+
Dependencies
|
|
17
|
+
============
|
|
18
|
+
|
|
19
|
+
py_stringmatching is tested to work under Python 2.7, Python 3.3, Python 3.4 and Python 3.5.
|
|
20
|
+
|
|
21
|
+
The required dependencies to build the package are NumPy >= 1.7.0,
|
|
22
|
+
Six and a working C/C++ compiler. For the development version, you will also require Cython.
|
|
23
|
+
|
|
24
|
+
Platforms
|
|
25
|
+
=========
|
|
26
|
+
|
|
27
|
+
py_stringmatching has been tested on Linux, OSX and Windows.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
2
|
+
|
|
3
|
+
# Import tokenizers
|
|
4
|
+
from py_stringmatching.tokenizer.alphabetic_tokenizer import AlphabeticTokenizer
|
|
5
|
+
from py_stringmatching.tokenizer.alphanumeric_tokenizer import AlphanumericTokenizer
|
|
6
|
+
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
|
|
7
|
+
from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer
|
|
8
|
+
from py_stringmatching.tokenizer.whitespace_tokenizer import WhitespaceTokenizer
|
|
9
|
+
|
|
10
|
+
# Import similarity measures
|
|
11
|
+
from py_stringmatching.similarity_measure.affine import Affine
|
|
12
|
+
from py_stringmatching.similarity_measure.cosine import Cosine
|
|
13
|
+
from py_stringmatching.similarity_measure.dice import Dice
|
|
14
|
+
from py_stringmatching.similarity_measure.hamming_distance import HammingDistance
|
|
15
|
+
from py_stringmatching.similarity_measure.jaccard import Jaccard
|
|
16
|
+
from py_stringmatching.similarity_measure.jaro import Jaro
|
|
17
|
+
from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
|
|
18
|
+
from py_stringmatching.similarity_measure.levenshtein import Levenshtein
|
|
19
|
+
from py_stringmatching.similarity_measure.monge_elkan import MongeElkan
|
|
20
|
+
from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch
|
|
21
|
+
from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient
|
|
22
|
+
from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman
|
|
23
|
+
from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
|
|
24
|
+
from py_stringmatching.similarity_measure.tfidf import TfIdf
|
|
25
|
+
|
|
File without changes
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
from py_stringmatching import utils
|
|
4
|
+
from six.moves import xrange
|
|
5
|
+
from py_stringmatching.similarity_measure.sequence_similarity_measure import \
|
|
6
|
+
SequenceSimilarityMeasure
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def sim_ident(char1, char2):
    """Identity similarity between two characters.

    Args:
        char1,char2 (str): Characters to compare.

    Returns:
        1 if the two characters compare equal, 0 otherwise (int).
    """
    return 1 if char1 == char2 else 0
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Affine(SequenceSimilarityMeasure):
    """Returns the affine gap score between two strings.

    The affine gap measure is an extension of the Needleman-Wunsch measure that handles the longer gaps more
    gracefully. For more information refer to the string matching chapter in the DI book ("Principles of Data Integration").

    Args:
        gap_start (float): Cost for the gap at the start (defaults to 1).
        gap_continuation (float): Cost for the gap continuation (defaults to 0.5).
        sim_func (function): Function computing similarity score between two characters, which are represented as strings (defaults
                             to an identity function, which returns 1 if the two characters are the same and returns 0 otherwise).

    Attributes:
        gap_start (float): An attribute to store the gap cost at the start.
        gap_continuation (float): An attribute to store the gap continuation cost.
        sim_func (function): An attribute to store the similarity function.
    """

    def __init__(self, gap_start=1, gap_continuation=0.5, sim_func=sim_ident):
        self.gap_start = gap_start
        self.gap_continuation = gap_continuation
        self.sim_func = sim_func
        super(Affine, self).__init__()

    def get_raw_score(self, string1, string2):
        """Computes the affine gap score between two strings. This score can be outside the range [0,1].

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score between the two input strings (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        # Gap costs are stored as positive numbers; they enter the DP as
        # negative contributions to the score.
        gap_start = -self.gap_start
        gap_continuation = -self.gap_continuation

        len1 = len(string1)
        len2 = len(string2)
        shape = (len1 + 1, len2 + 1)

        # np.float64 is used instead of the ``np.float`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24. Both name the
        # same dtype, so the computed scores are unchanged.
        m = np.zeros(shape, dtype=np.float64)
        x = np.zeros(shape, dtype=np.float64)
        y = np.zeros(shape, dtype=np.float64)

        # DP initialization: first column (prefix of string1 against nothing)
        for i in xrange(1, len1 + 1):
            m[i][0] = -float("inf")
            x[i][0] = gap_start + (i - 1) * gap_continuation
            y[i][0] = -float("inf")

        # DP initialization: first row (prefix of string2 against nothing)
        for j in xrange(1, len2 + 1):
            m[0][j] = -float("inf")
            x[0][j] = -float("inf")
            y[0][j] = gap_start + (j - 1) * gap_continuation

        # affine gap calculation using DP
        for i in xrange(1, len1 + 1):
            for j in xrange(1, len2 + 1):
                # best score between x_1....x_i and y_1....y_j
                # given that x_i is aligned to y_j
                m[i][j] = (self.sim_func(string1[i - 1], string2[j - 1]) +
                           max(m[i - 1][j - 1], x[i - 1][j - 1],
                               y[i - 1][j - 1]))

                # the best score given that x_i is aligned to a gap
                x[i][j] = max(gap_start + m[i - 1][j],
                              gap_continuation + x[i - 1][j])

                # the best score given that y_j is aligned to a gap
                y[i][j] = max(gap_start + m[i][j - 1],
                              gap_continuation + y[i][j - 1])

        # The final score is the best of the three alignment states at the
        # bottom-right cell.
        return max(m[len1][len2], x[len1][len2], y[len1][len2])

    def get_gap_start(self):
        """Get gap start cost.

        Returns:
            gap start cost (float).
        """
        return self.gap_start

    def get_gap_continuation(self):
        """Get gap continuation cost.

        Returns:
            gap continuation cost (float).
        """
        return self.gap_continuation

    def get_sim_func(self):
        """Get similarity function.

        Returns:
            similarity function (function).
        """
        return self.sim_func

    def set_gap_start(self, gap_start):
        """Set gap start cost.

        Args:
            gap_start (float): Cost for the gap at the start.

        Returns:
            True, once the attribute has been updated (bool).
        """
        self.gap_start = gap_start
        return True

    def set_gap_continuation(self, gap_continuation):
        """Set gap continuation cost.

        Args:
            gap_continuation (float): Cost for the gap continuation.

        Returns:
            True, once the attribute has been updated (bool).
        """
        self.gap_continuation = gap_continuation
        return True

    def set_sim_func(self, sim_func):
        """Set similarity function.

        Args:
            sim_func (function): Function computing similarity score between two characters, represented as strings.

        Returns:
            True, once the attribute has been updated (bool).
        """
        self.sim_func = sim_func
        return True
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import math
|
|
2
|
+
|
|
3
|
+
from py_stringmatching import utils
|
|
4
|
+
from py_stringmatching.similarity_measure.token_similarity_measure import \
|
|
5
|
+
TokenSimilarityMeasure
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Cosine(TokenSimilarityMeasure):
    """Computes a variant of cosine measure known as Ochiai coefficient.

    This is not the cosine measure that computes the cosine of the angle between two given vectors. Rather, it computes a variant of cosine measure known as Ochiai coefficient (see the Wikipedia page "Cosine Similarity"). Specifically, for two sets X and Y, this measure computes:

    :math:`cosine(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}`
    """

    def __init__(self):
        super(Cosine, self).__init__()

    def get_raw_score(self, set1, set2):
        """Computes the raw cosine score between two sets.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Cosine similarity (float)

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> cos = Cosine()
            >>> cos.get_raw_score(['data', 'science'], ['data'])
            0.7071067811865475
            >>> cos.get_raw_score(['data', 'data', 'science'], ['data', 'management'])
            0.4999999999999999
            >>> cos.get_raw_score([], ['data'])
            0.0

        References:
            * String similarity joins: An Experimental Evaluation (a paper appearing in the VLDB 2014 Conference).
            * Project Flamingo at http://flamingo.ics.uci.edu.
        """
        # input validations
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # if exact match return 1.0
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # if one of the sets is empty return 0.0 (a float, matching the
        # documented return type and the docstring example above)
        if utils.sim_check_for_empty(set1, set2):
            return 0.0

        # Lists are reduced to sets so duplicate tokens count once.
        if not isinstance(set1, set):
            set1 = set(set1)
        if not isinstance(set2, set):
            set2 = set(set2)

        return float(len(set1 & set2)) / (math.sqrt(float(len(set1))) *
                                          math.sqrt(float(len(set2))))

    def get_sim_score(self, set1, set2):
        """Computes the normalized cosine similarity between two sets.

        The raw score is returned unchanged; no further normalization is applied.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Normalized cosine similarity (float)

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> cos = Cosine()
            >>> cos.get_sim_score(['data', 'science'], ['data'])
            0.7071067811865475
            >>> cos.get_sim_score(['data', 'data', 'science'], ['data', 'management'])
            0.4999999999999999
            >>> cos.get_sim_score([], ['data'])
            0.0

        """
        return self.get_raw_score(set1, set2)
|