py-stringmatching 0.1.0__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_stringmatching-0.1.0/AUTHORS.rst +6 -0
- py_stringmatching-0.1.0/CHANGES.txt +6 -0
- py_stringmatching-0.1.0/LICENSE +27 -0
- py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
- py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
- py_stringmatching-0.1.0/MANIFEST.in +6 -0
- py_stringmatching-0.1.0/PKG-INFO +57 -0
- py_stringmatching-0.1.0/README.rst +27 -0
- py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
- py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
- py_stringmatching-0.1.0/requirements.txt +2 -0
- py_stringmatching-0.1.0/setup.cfg +5 -0
- py_stringmatching-0.1.0/setup.py +107 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from py_stringmatching import utils
|
|
2
|
+
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class WhitespaceTokenizer(DelimiterTokenizer):
    """Tokenizer that splits the input string on whitespace.

    Python's built-in ``str.split`` is used under the hood, so "whitespace"
    covers the space character as well as the tab and newline characters.

    Args:
        return_set (boolean): A flag to indicate whether to return a set of
                              tokens instead of a bag of tokens (defaults to False).

    Attributes:
        return_set (boolean): An attribute to store the flag return_set.
    """

    def __init__(self, return_set=False):
        # Delegate to DelimiterTokenizer with the three whitespace delimiters.
        super(WhitespaceTokenizer, self).__init__([' ', '\t', '\n'], return_set)

    def tokenize(self, input_string):
        """Tokenizes input string based on white space.

        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which is a set or a bag of tokens, depending on whether return_set is True or False.

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> ws_tok = WhitespaceTokenizer()
            >>> ws_tok.tokenize('data science')
            ['data', 'science']
            >>> ws_tok.tokenize('data science')
            ['data', 'science']
            >>> ws_tok.tokenize('data\tscience')
            ['data', 'science']
            >>> ws_tok = WhitespaceTokenizer(return_set=True)
            >>> ws_tok.tokenize('data science data integration')
            ['data', 'science', 'integration']
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        # str.split() with no argument already drops empty segments; the
        # extra truthiness filter mirrors the original belt-and-braces check.
        token_list = [token for token in input_string.split() if token]

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list

    def set_delim_set(self, delim_set):
        # The whitespace delimiter set is fixed for this tokenizer.
        raise AttributeError('Delimiters cannot be set for WhitespaceTokenizer')
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
|
|
3
|
+
import six
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
This module defines a list of utility and validation functions.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def sim_check_for_none(*args):
    """Validate that the first two positional arguments, when present, are not None.

    Raises:
        TypeError: If the first or second argument is None.
    """
    if args and args[0] is None:
        raise TypeError("First argument cannot be None")
    if len(args) > 1 and args[1] is None:
        raise TypeError("Second argument cannot be None")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def sim_check_for_empty(*args):
    """Return True when either of the first two arguments has zero length.

    Returns None implicitly when both arguments are non-empty, matching the
    original short-circuit behaviour (the second length is only taken when
    the first is non-zero).
    """
    if len(args[0]) == 0:
        return True
    if len(args[1]) == 0:
        return True
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def sim_check_for_same_len(*args):
    """Raise ValueError unless the first two arguments have equal length."""
    first_len = len(args[0])
    second_len = len(args[1])
    if first_len != second_len:
        raise ValueError("Undefined for sequences of unequal length")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def sim_check_for_string_inputs(*args):
    """Raise TypeError unless the first two arguments are strings.

    Uses six.string_types so the check works for both str and unicode
    under Python 2.
    """
    for index, label in ((0, 'First'), (1, 'Second')):
        if not isinstance(args[index], six.string_types):
            raise TypeError('%s argument is expected to be a string' % label)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def sim_check_for_list_or_set_inputs(*args):
    """Raise TypeError unless the first two arguments are lists or sets."""
    # isinstance accepts a tuple of types, replacing the nested checks.
    if not isinstance(args[0], (list, set)):
        raise TypeError('First argument is expected to be a python list or set')
    if not isinstance(args[1], (list, set)):
        raise TypeError('Second argument is expected to be a python list or set')
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def sim_check_for_exact_match(*args):
    """Return True when the first two arguments compare equal, else None."""
    return True if args[0] == args[1] else None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def tok_check_for_string_input(*args):
    """Raise TypeError if any of the given arguments is not a string.

    Uses six.string_types so the check works for both str and unicode
    under Python 2.
    """
    for arg in args:
        if not isinstance(arg, six.string_types):
            raise TypeError('Input is expected to be a string')
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def tok_check_for_none(*args):
    """Raise TypeError when the first argument is None."""
    first = args[0]
    if first is None:
        raise TypeError("First argument cannot be None")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def convert_bag_to_set(input_list):
    """Return the tokens of input_list de-duplicated, preserving first-seen order.

    Args:
        input_list (list): A bag (list) of tokens, possibly with duplicates.

    Returns:
        A Python list containing each distinct token exactly once, in the
        order of its first occurrence in input_list.
    """
    # A real set gives O(1) membership tests; the original used a dict of
    # True values and compared `.get(token) == None`, which both violates
    # the `is None` idiom and would misfire for a value comparing equal
    # to None.
    seen_tokens = set()
    output_set = []
    for token in input_list:
        if token not in seen_tokens:
            seen_tokens.add(token)
            output_set.append(token)
    return output_set
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
Metadata-Version: 1.1
|
|
2
|
+
Name: py-stringmatching
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python library for string matching.
|
|
5
|
+
Home-page: https://sites.google.com/site/anhaidgroup/projects/py_stringmatching
|
|
6
|
+
Author: UW Magellan Team
|
|
7
|
+
Author-email: uwmagellan@gmail.com
|
|
8
|
+
License: BSD
|
|
9
|
+
Description: py_stringmatching
|
|
10
|
+
=================
|
|
11
|
+
|
|
12
|
+
py_stringmatching is a software package in Python that consists of a comprehensive set of tokenizers and string similarity measures (
|
|
13
|
+
such as edit distance, Jaccard, and TF/IDF). It is free, open-source, and BSD-licensed.
|
|
14
|
+
|
|
15
|
+
Important links
|
|
16
|
+
===============
|
|
17
|
+
|
|
18
|
+
* Repository: https://github.com/anhaidgroup/py_stringmatching
|
|
19
|
+
* Documentation: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/index.html
|
|
20
|
+
* Tutorial: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/Tutorial.html
|
|
21
|
+
* Issue Tracker: https://github.com/anhaidgroup/py_stringmatching/issues
|
|
22
|
+
* Mailing List: https://groups.google.com/forum/#!forum/py_stringmatching
|
|
23
|
+
|
|
24
|
+
Dependencies
|
|
25
|
+
============
|
|
26
|
+
|
|
27
|
+
py_stringmatching is tested to work under Python 2.7, Python 3.3, Python 3.4 and Python 3.5.
|
|
28
|
+
|
|
29
|
+
The required dependencies to build the package are NumPy >= 1.7.0,
|
|
30
|
+
Six and a working C/C++ compiler. For the development version, you will also require Cython.
|
|
31
|
+
|
|
32
|
+
Platforms
|
|
33
|
+
=========
|
|
34
|
+
|
|
35
|
+
py_stringmatching has been tested on Linux, OSX and Windows.
|
|
36
|
+
|
|
37
|
+
Platform: UNKNOWN
|
|
38
|
+
Classifier: Development Status :: 4 - Beta
|
|
39
|
+
Classifier: Environment :: Console
|
|
40
|
+
Classifier: Intended Audience :: Developers
|
|
41
|
+
Classifier: Intended Audience :: Science/Research
|
|
42
|
+
Classifier: Intended Audience :: Education
|
|
43
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
44
|
+
Classifier: Operating System :: POSIX
|
|
45
|
+
Classifier: Operating System :: Unix
|
|
46
|
+
Classifier: Operating System :: MacOS
|
|
47
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
48
|
+
Classifier: Programming Language :: Python
|
|
49
|
+
Classifier: Programming Language :: Python :: 2
|
|
50
|
+
Classifier: Programming Language :: Python :: 3
|
|
51
|
+
Classifier: Programming Language :: Python :: 2.7
|
|
52
|
+
Classifier: Programming Language :: Python :: 3.3
|
|
53
|
+
Classifier: Programming Language :: Python :: 3.4
|
|
54
|
+
Classifier: Programming Language :: Python :: 3.5
|
|
55
|
+
Classifier: Topic :: Scientific/Engineering
|
|
56
|
+
Classifier: Topic :: Utilities
|
|
57
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
AUTHORS.rst
|
|
2
|
+
CHANGES.txt
|
|
3
|
+
LICENSE
|
|
4
|
+
MANIFEST.in
|
|
5
|
+
README.rst
|
|
6
|
+
requirements.txt
|
|
7
|
+
setup.py
|
|
8
|
+
LICENSES/NUMPY_LICENSE
|
|
9
|
+
LICENSES/SIX_LICENSE
|
|
10
|
+
py_stringmatching/__init__.py
|
|
11
|
+
py_stringmatching/utils.py
|
|
12
|
+
py_stringmatching.egg-info/PKG-INFO
|
|
13
|
+
py_stringmatching.egg-info/SOURCES.txt
|
|
14
|
+
py_stringmatching.egg-info/dependency_links.txt
|
|
15
|
+
py_stringmatching.egg-info/not-zip-safe
|
|
16
|
+
py_stringmatching.egg-info/requires.txt
|
|
17
|
+
py_stringmatching.egg-info/top_level.txt
|
|
18
|
+
py_stringmatching/similarity_measure/__init__.py
|
|
19
|
+
py_stringmatching/similarity_measure/affine.py
|
|
20
|
+
py_stringmatching/similarity_measure/cosine.py
|
|
21
|
+
py_stringmatching/similarity_measure/cython_levenshtein.c
|
|
22
|
+
py_stringmatching/similarity_measure/dice.py
|
|
23
|
+
py_stringmatching/similarity_measure/hamming_distance.py
|
|
24
|
+
py_stringmatching/similarity_measure/hybrid_similarity_measure.py
|
|
25
|
+
py_stringmatching/similarity_measure/jaccard.py
|
|
26
|
+
py_stringmatching/similarity_measure/jaro.py
|
|
27
|
+
py_stringmatching/similarity_measure/jaro_winkler.py
|
|
28
|
+
py_stringmatching/similarity_measure/levenshtein.py
|
|
29
|
+
py_stringmatching/similarity_measure/monge_elkan.py
|
|
30
|
+
py_stringmatching/similarity_measure/needleman_wunsch.py
|
|
31
|
+
py_stringmatching/similarity_measure/overlap_coefficient.py
|
|
32
|
+
py_stringmatching/similarity_measure/sequence_similarity_measure.py
|
|
33
|
+
py_stringmatching/similarity_measure/similarity_measure.py
|
|
34
|
+
py_stringmatching/similarity_measure/smith_waterman.py
|
|
35
|
+
py_stringmatching/similarity_measure/soft_tfidf.py
|
|
36
|
+
py_stringmatching/similarity_measure/tfidf.py
|
|
37
|
+
py_stringmatching/similarity_measure/token_similarity_measure.py
|
|
38
|
+
py_stringmatching/tests/__init__.py
|
|
39
|
+
py_stringmatching/tests/test_simfunctions.py
|
|
40
|
+
py_stringmatching/tests/test_tokenizers.py
|
|
41
|
+
py_stringmatching/tokenizer/__init__.py
|
|
42
|
+
py_stringmatching/tokenizer/alphabetic_tokenizer.py
|
|
43
|
+
py_stringmatching/tokenizer/alphanumeric_tokenizer.py
|
|
44
|
+
py_stringmatching/tokenizer/definition_tokenizer.py
|
|
45
|
+
py_stringmatching/tokenizer/delimiter_tokenizer.py
|
|
46
|
+
py_stringmatching/tokenizer/qgram_tokenizer.py
|
|
47
|
+
py_stringmatching/tokenizer/tokenizer.py
|
|
48
|
+
py_stringmatching/tokenizer/whitespace_tokenizer.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
py_stringmatching
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
# check if pip is installed. If not, raise an ImportError
# (pip is required below by install_and_import to bootstrap setuptools
# and numpy before the build can proceed).
PIP_INSTALLED = True

try:
    import pip
except ImportError:
    PIP_INSTALLED = False

if not PIP_INSTALLED:
    raise ImportError('pip is not installed.')
|
|
15
|
+
|
|
16
|
+
def install_and_import(package):
    """Import *package*, installing it with pip first when it is missing.

    The imported module is bound into this module's globals under the
    package name, so later top-level code can refer to it directly
    (e.g. ``setuptools`` and ``numpy`` below).

    Args:
        package (str): Name of the package to import (and install if absent).
    """
    import importlib
    try:
        importlib.import_module(package)
    except ImportError:
        # Not importable yet: install it into the current environment.
        pip.main(['install', package])
    finally:
        # Import again and expose the module at file scope; this raises
        # if the install above did not succeed.
        globals()[package] = importlib.import_module(package)
|
|
24
|
+
|
|
25
|
+
# check if setuptools is installed. If not, install setuptools
# automatically using pip. After this call, `setuptools` is available
# as a module-level name (bound via install_and_import's globals()).
install_and_import('setuptools')

# make sure numpy is installed, as we need numpy to compile the C extensions.
# If numpy is not installed, automatically install it using pip.
install_and_import('numpy')
|
|
32
|
+
|
|
33
|
+
def generate_cython():
    """Run build_tools/cythonize.py over the py_stringmatching sources.

    Raises:
        RuntimeError: If the cythonize subprocess exits with a non-zero status.
    """
    source_dir = os.path.abspath(os.path.dirname(__file__))
    print("Cythonizing sources")
    cythonize_script = os.path.join(source_dir, 'build_tools', 'cythonize.py')
    exit_code = subprocess.call(
        [sys.executable, cythonize_script, 'py_stringmatching'],
        cwd=source_dir)
    if exit_code != 0:
        raise RuntimeError("Running cythonize failed!")
|
|
43
|
+
|
|
44
|
+
if __name__ == "__main__":

    # "No-frills" commands (help, metadata-only, clean) do not need the
    # generated C sources, so Cython generation is skipped for them.
    no_frills = (len(sys.argv) >= 2 and ('--help' in sys.argv[1:] or
                 sys.argv[1] in ('--help-commands',
                                 'egg_info', '--version',
                                 'clean')))

    cwd = os.path.abspath(os.path.dirname(__file__))
    if not os.path.exists(os.path.join(cwd, 'PKG-INFO')) and not no_frills:
        # Generate Cython sources, unless building from source release
        # (a source release ships PKG-INFO alongside the pre-generated .c files).
        generate_cython()

    # specify extensions that need to be compiled; the numpy include dir is
    # passed so the generated C can find the NumPy headers.
    extensions = [setuptools.Extension("py_stringmatching.similarity_measure.cython_levenshtein",
                                       ["py_stringmatching/similarity_measure/cython_levenshtein.c"],
                                       include_dirs=[numpy.get_include()])]

    # find packages to be included. exclude benchmarks.
    packages = setuptools.find_packages(exclude=["benchmarks"])

    # The README doubles as the long description shown on the package index.
    with open('README.rst') as f:
        LONG_DESCRIPTION = f.read()

    setuptools.setup(
        name='py_stringmatching',
        version='0.1.0',
        description='Python library for string matching.',
        long_description=LONG_DESCRIPTION,
        url='https://sites.google.com/site/anhaidgroup/projects/py_stringmatching',
        author='UW Magellan Team',
        author_email='uwmagellan@gmail.com',
        license='BSD',
        classifiers=[
            'Development Status :: 4 - Beta',
            'Environment :: Console',
            'Intended Audience :: Developers',
            'Intended Audience :: Science/Research',
            'Intended Audience :: Education',
            'License :: OSI Approved :: BSD License',
            'Operating System :: POSIX',
            'Operating System :: Unix',
            'Operating System :: MacOS',
            'Operating System :: Microsoft :: Windows',
            'Programming Language :: Python',
            'Programming Language :: Python :: 2',
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 2.7',
            'Programming Language :: Python :: 3.3',
            'Programming Language :: Python :: 3.4',
            'Programming Language :: Python :: 3.5',
            'Topic :: Scientific/Engineering',
            'Topic :: Utilities',
            'Topic :: Software Development :: Libraries',
        ],
        packages=packages,
        install_requires=[
            'numpy >= 1.7.0',
            'six'
        ],
        ext_modules=extensions,
        include_package_data=True,
        zip_safe=False
    )