py-stringmatching 0.1.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. py_stringmatching-0.1.0/AUTHORS.rst +6 -0
  2. py_stringmatching-0.1.0/CHANGES.txt +6 -0
  3. py_stringmatching-0.1.0/LICENSE +27 -0
  4. py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
  5. py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
  6. py_stringmatching-0.1.0/MANIFEST.in +6 -0
  7. py_stringmatching-0.1.0/PKG-INFO +57 -0
  8. py_stringmatching-0.1.0/README.rst +27 -0
  9. py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
  10. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
  11. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
  12. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
  13. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
  14. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
  15. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
  16. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
  17. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
  18. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
  19. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
  20. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
  21. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
  22. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
  23. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
  24. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
  25. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
  26. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
  27. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
  28. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
  29. py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
  30. py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
  31. py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
  32. py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
  33. py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
  34. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
  35. py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
  36. py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
  37. py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
  38. py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
  39. py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
  40. py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
  41. py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
  42. py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
  43. py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
  44. py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
  45. py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
  46. py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
  47. py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
  48. py_stringmatching-0.1.0/requirements.txt +2 -0
  49. py_stringmatching-0.1.0/setup.cfg +5 -0
  50. py_stringmatching-0.1.0/setup.py +107 -0
@@ -0,0 +1,57 @@
1
+ from py_stringmatching import utils
2
+ from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
3
+
4
+
5
class WhitespaceTokenizer(DelimiterTokenizer):
    """Tokenizer that splits the input string on runs of whitespace.

    Internally this relies on Python's str.split with no explicit
    delimiter, so the space, tab and newline characters all act as
    separators.

    Args:
        return_set (boolean): A flag to indicate whether to return a set of
                              tokens instead of a bag of tokens (defaults to False).

    Attributes:
        return_set (boolean): An attribute to store the flag return_set.
    """

    def __init__(self, return_set=False):
        # Delegate to DelimiterTokenizer with the three whitespace delimiters.
        super(WhitespaceTokenizer, self).__init__([' ', '\t', '\n'], return_set)

    def tokenize(self, input_string):
        """Tokenizes input string based on white space.

        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which is a set or a bag of tokens, depending on
            whether return_set is True or False.

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> ws_tok = WhitespaceTokenizer()
            >>> ws_tok.tokenize('data science')
            ['data', 'science']
            >>> ws_tok = WhitespaceTokenizer(return_set=True)
            >>> ws_tok.tokenize('data science data integration')
            ['data', 'science', 'integration']
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        # str.split() with no argument already collapses whitespace runs;
        # the truthiness filter mirrors the original filter(None, ...) call.
        tokens = [tok for tok in input_string.split() if tok]

        if self.return_set:
            return utils.convert_bag_to_set(tokens)

        return tokens

    def set_delim_set(self, delim_set):
        """Disallowed for this tokenizer: its delimiter set is fixed."""
        raise AttributeError('Delimiters cannot be set for WhitespaceTokenizer')
@@ -0,0 +1,70 @@
1
+ import functools
2
+
3
+ import six
4
+
5
+ """
6
+ This module defines a list of utility and validation functions.
7
+ """
8
+
9
+
10
def sim_check_for_none(*args):
    """Raise TypeError when either of the first two arguments is None."""
    # Check each of the first two positions, if present.
    for position, ordinal in ((0, "First"), (1, "Second")):
        if len(args) > position and args[position] is None:
            raise TypeError("%s argument cannot be None" % ordinal)
15
+
16
+
17
def sim_check_for_empty(*args):
    """Return True when either of the first two sequences has length zero."""
    if 0 in (len(args[0]), len(args[1])):
        return True
20
+
21
+
22
def sim_check_for_same_len(*args):
    """Raise ValueError when the first two sequences differ in length."""
    first_length = len(args[0])
    second_length = len(args[1])
    if first_length != second_length:
        raise ValueError("Undefined for sequences of unequal length")
25
+
26
+
27
def sim_check_for_string_inputs(*args):
    """Raise TypeError unless the first two arguments are strings."""
    # six.string_types covers both str and (on Python 2) unicode.
    for index, ordinal in enumerate(("First", "Second")):
        if not isinstance(args[index], six.string_types):
            raise TypeError('%s argument is expected to be a string' % ordinal)
32
+
33
+
34
def sim_check_for_list_or_set_inputs(*args):
    """Raise TypeError unless the first two arguments are lists or sets.

    Args:
        *args: The values to validate; only the first two are checked.

    Raises:
        TypeError: If either of the first two arguments is neither a
            python list nor a set.
    """
    # isinstance accepts a tuple of types, which replaces the original
    # nested single-type checks with one test per argument.
    if not isinstance(args[0], (list, set)):
        raise TypeError('First argument is expected to be a python list or set')
    if not isinstance(args[1], (list, set)):
        raise TypeError('Second argument is expected to be a python list or set')
41
+
42
+
43
+
44
+
45
def sim_check_for_exact_match(*args):
    """Return True when the first two arguments compare equal."""
    first, second = args[0], args[1]
    if first == second:
        return True
48
+
49
+
50
+
51
+
52
def tok_check_for_string_input(*args):
    """Raise TypeError if any argument is not a string.

    Args:
        *args: The values to validate; every argument is checked.

    Raises:
        TypeError: If any argument is not a string.
    """
    # Iterate the arguments directly instead of indexing via range(len(...)).
    for arg in args:
        if not isinstance(arg, six.string_types):
            raise TypeError('Input is expected to be a string')
56
+
57
+
58
def tok_check_for_none(*args):
    """Raise TypeError when the first argument is None."""
    first = args[0]
    if first is None:
        raise TypeError("First argument cannot be None")
61
+
62
+
63
def convert_bag_to_set(input_list):
    """Return the unique tokens of input_list, preserving first-seen order.

    Args:
        input_list (list): A bag (list with possible duplicates) of tokens.

    Returns:
        A list containing each distinct token exactly once, in the order
        it first appeared in input_list.
    """
    # A set gives O(1) membership checks; the original used a dict with a
    # `.get(token) == None` comparison (should have been `is None`).
    seen_tokens = set()
    output_set = []
    for token in input_list:
        if token not in seen_tokens:
            seen_tokens.add(token)
            output_set.append(token)
    return output_set
@@ -0,0 +1,57 @@
1
+ Metadata-Version: 1.1
2
+ Name: py-stringmatching
3
+ Version: 0.1.0
4
+ Summary: Python library for string matching.
5
+ Home-page: https://sites.google.com/site/anhaidgroup/projects/py_stringmatching
6
+ Author: UW Magellan Team
7
+ Author-email: uwmagellan@gmail.com
8
+ License: BSD
9
+ Description: py_stringmatching
10
+ =================
11
+
12
+ py_stringmatching is a software package in Python that consists of a comprehensive set of tokenizers and string similarity measures (
13
+ such as edit distance, Jaccard, and TF/IDF). It is free, open-source, and BSD-licensed.
14
+
15
+ Important links
16
+ ===============
17
+
18
+ * Repository: https://github.com/anhaidgroup/py_stringmatching
19
+ * Documentation: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/index.html
20
+ * Tutorial: http://anhaidgroup.github.io/py_stringmatching/v0.1.x/Tutorial.html
21
+ * Issue Tracker: https://github.com/anhaidgroup/py_stringmatching/issues
22
+ * Mailing List: https://groups.google.com/forum/#!forum/py_stringmatching
23
+
24
+ Dependencies
25
+ ============
26
+
27
+ py_stringmatching is tested to work under Python 2.7, Python 3.3, Python 3.4 and Python 3.5.
28
+
29
+ The required dependencies to build the package are NumPy >= 1.7.0,
30
+ Six and a working C/C++ compiler. For the development version, you will also require Cython.
31
+
32
+ Platforms
33
+ =========
34
+
35
+ py_stringmatching has been tested on Linux, OSX and Windows.
36
+
37
+ Platform: UNKNOWN
38
+ Classifier: Development Status :: 4 - Beta
39
+ Classifier: Environment :: Console
40
+ Classifier: Intended Audience :: Developers
41
+ Classifier: Intended Audience :: Science/Research
42
+ Classifier: Intended Audience :: Education
43
+ Classifier: License :: OSI Approved :: BSD License
44
+ Classifier: Operating System :: POSIX
45
+ Classifier: Operating System :: Unix
46
+ Classifier: Operating System :: MacOS
47
+ Classifier: Operating System :: Microsoft :: Windows
48
+ Classifier: Programming Language :: Python
49
+ Classifier: Programming Language :: Python :: 2
50
+ Classifier: Programming Language :: Python :: 3
51
+ Classifier: Programming Language :: Python :: 2.7
52
+ Classifier: Programming Language :: Python :: 3.3
53
+ Classifier: Programming Language :: Python :: 3.4
54
+ Classifier: Programming Language :: Python :: 3.5
55
+ Classifier: Topic :: Scientific/Engineering
56
+ Classifier: Topic :: Utilities
57
+ Classifier: Topic :: Software Development :: Libraries
@@ -0,0 +1,48 @@
1
+ AUTHORS.rst
2
+ CHANGES.txt
3
+ LICENSE
4
+ MANIFEST.in
5
+ README.rst
6
+ requirements.txt
7
+ setup.py
8
+ LICENSES/NUMPY_LICENSE
9
+ LICENSES/SIX_LICENSE
10
+ py_stringmatching/__init__.py
11
+ py_stringmatching/utils.py
12
+ py_stringmatching.egg-info/PKG-INFO
13
+ py_stringmatching.egg-info/SOURCES.txt
14
+ py_stringmatching.egg-info/dependency_links.txt
15
+ py_stringmatching.egg-info/not-zip-safe
16
+ py_stringmatching.egg-info/requires.txt
17
+ py_stringmatching.egg-info/top_level.txt
18
+ py_stringmatching/similarity_measure/__init__.py
19
+ py_stringmatching/similarity_measure/affine.py
20
+ py_stringmatching/similarity_measure/cosine.py
21
+ py_stringmatching/similarity_measure/cython_levenshtein.c
22
+ py_stringmatching/similarity_measure/dice.py
23
+ py_stringmatching/similarity_measure/hamming_distance.py
24
+ py_stringmatching/similarity_measure/hybrid_similarity_measure.py
25
+ py_stringmatching/similarity_measure/jaccard.py
26
+ py_stringmatching/similarity_measure/jaro.py
27
+ py_stringmatching/similarity_measure/jaro_winkler.py
28
+ py_stringmatching/similarity_measure/levenshtein.py
29
+ py_stringmatching/similarity_measure/monge_elkan.py
30
+ py_stringmatching/similarity_measure/needleman_wunsch.py
31
+ py_stringmatching/similarity_measure/overlap_coefficient.py
32
+ py_stringmatching/similarity_measure/sequence_similarity_measure.py
33
+ py_stringmatching/similarity_measure/similarity_measure.py
34
+ py_stringmatching/similarity_measure/smith_waterman.py
35
+ py_stringmatching/similarity_measure/soft_tfidf.py
36
+ py_stringmatching/similarity_measure/tfidf.py
37
+ py_stringmatching/similarity_measure/token_similarity_measure.py
38
+ py_stringmatching/tests/__init__.py
39
+ py_stringmatching/tests/test_simfunctions.py
40
+ py_stringmatching/tests/test_tokenizers.py
41
+ py_stringmatching/tokenizer/__init__.py
42
+ py_stringmatching/tokenizer/alphabetic_tokenizer.py
43
+ py_stringmatching/tokenizer/alphanumeric_tokenizer.py
44
+ py_stringmatching/tokenizer/definition_tokenizer.py
45
+ py_stringmatching/tokenizer/delimiter_tokenizer.py
46
+ py_stringmatching/tokenizer/qgram_tokenizer.py
47
+ py_stringmatching/tokenizer/tokenizer.py
48
+ py_stringmatching/tokenizer/whitespace_tokenizer.py
@@ -0,0 +1,2 @@
1
+ numpy >= 1.7.0
2
+ six
@@ -0,0 +1 @@
1
+ py_stringmatching
@@ -0,0 +1,2 @@
1
+ numpy>=1.7.0
2
+ six
@@ -0,0 +1,5 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+ tag_svn_revision = 0
5
+
@@ -0,0 +1,107 @@
1
+ import subprocess
2
+ import sys
3
+ import os
4
+
5
# check if pip is installed. If not, raise an ImportError
# (pip is required below by install_and_import to auto-install
# setuptools and numpy before the build can proceed).
PIP_INSTALLED = True

try:
    import pip
except ImportError:
    PIP_INSTALLED = False

# Fail fast with a clear message rather than a NameError later
# when install_and_import tries to call pip.main.
if not PIP_INSTALLED:
    raise ImportError('pip is not installed.')
15
+
16
def install_and_import(package):
    """Import *package*, pip-installing it first if the import fails.

    After this call the imported module is bound into this module's
    globals under the package name, so later code can use it directly.
    """
    import importlib
    try:
        importlib.import_module(package)
    except ImportError:
        # NOTE(review): pip.main was removed in pip >= 10; this call only
        # works with older pip versions — confirm against supported tooling.
        pip.main(['install', package])
    finally:
        # Import again (now that it is installed) and expose it module-wide.
        # If installation failed, this re-raises the ImportError.
        globals()[package] = importlib.import_module(package)

# check if setuptools is installed. If not, install setuptools
# automatically using pip.
install_and_import('setuptools')

# make sure numpy is installed, as we need numpy to compile the C extensions.
# If numpy is not installed, automatically install it using pip.
install_and_import('numpy')
32
+
33
def generate_cython():
    """Regenerate C sources from the Cython files in py_stringmatching.

    Runs build_tools/cythonize.py with the current interpreter and raises
    RuntimeError if the script exits with a non-zero status.
    """
    source_root = os.path.abspath(os.path.dirname(__file__))
    print("Cythonizing sources")
    cythonize_script = os.path.join(source_root, 'build_tools', 'cythonize.py')
    status = subprocess.call([sys.executable, cythonize_script, 'py_stringmatching'],
                             cwd=source_root)
    if status != 0:
        raise RuntimeError("Running cythonize failed!")
43
+
44
+ if __name__ == "__main__":
45
+
46
+ no_frills = (len(sys.argv) >= 2 and ('--help' in sys.argv[1:] or
47
+ sys.argv[1] in ('--help-commands',
48
+ 'egg_info', '--version',
49
+ 'clean')))
50
+
51
+ cwd = os.path.abspath(os.path.dirname(__file__))
52
+ if not os.path.exists(os.path.join(cwd, 'PKG-INFO')) and not no_frills:
53
+ # Generate Cython sources, unless building from source release
54
+ generate_cython()
55
+
56
+ # specify extensions that need to be compiled
57
+ extensions = [setuptools.Extension("py_stringmatching.similarity_measure.cython_levenshtein",
58
+ ["py_stringmatching/similarity_measure/cython_levenshtein.c"],
59
+ include_dirs=[numpy.get_include()])]
60
+
61
+ # find packages to be included. exclude benchmarks.
62
+ packages = setuptools.find_packages(exclude=["benchmarks"])
63
+
64
+ with open('README.rst') as f:
65
+ LONG_DESCRIPTION = f.read()
66
+
67
+ setuptools.setup(
68
+ name='py_stringmatching',
69
+ version='0.1.0',
70
+ description='Python library for string matching.',
71
+ long_description=LONG_DESCRIPTION,
72
+ url='https://sites.google.com/site/anhaidgroup/projects/py_stringmatching',
73
+ author='UW Magellan Team',
74
+ author_email='uwmagellan@gmail.com',
75
+ license='BSD',
76
+ classifiers=[
77
+ 'Development Status :: 4 - Beta',
78
+ 'Environment :: Console',
79
+ 'Intended Audience :: Developers',
80
+ 'Intended Audience :: Science/Research',
81
+ 'Intended Audience :: Education',
82
+ 'License :: OSI Approved :: BSD License',
83
+ 'Operating System :: POSIX',
84
+ 'Operating System :: Unix',
85
+ 'Operating System :: MacOS',
86
+ 'Operating System :: Microsoft :: Windows',
87
+ 'Programming Language :: Python',
88
+ 'Programming Language :: Python :: 2',
89
+ 'Programming Language :: Python :: 3',
90
+ 'Programming Language :: Python :: 2.7',
91
+ 'Programming Language :: Python :: 3.3',
92
+ 'Programming Language :: Python :: 3.4',
93
+ 'Programming Language :: Python :: 3.5',
94
+ 'Topic :: Scientific/Engineering',
95
+ 'Topic :: Utilities',
96
+ 'Topic :: Software Development :: Libraries',
97
+ ],
98
+ packages=packages,
99
+ install_requires=[
100
+ 'numpy >= 1.7.0',
101
+ 'six'
102
+ ],
103
+ ext_modules=extensions,
104
+ include_package_data=True,
105
+ zip_safe=False
106
+ )
107
+