nltkor 1.2.14__cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,441 @@
|
|
1
|
+
# Import relevant libraries and dependencies
|
2
|
+
from typing import List, Union, Dict, Tuple
|
3
|
+
import numpy as np
|
4
|
+
from .wasserstein import WassersteinDistance
|
5
|
+
|
6
|
+
"""
|
7
|
+
add string2string module code, src = https://github.com/stanfordnlp/string2string
|
8
|
+
|
9
|
+
MIT License
|
10
|
+
|
11
|
+
Copyright (c) 2023 Mirac Suzgun
|
12
|
+
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
15
|
+
in the Software without restriction, including without limitation the rights
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
18
|
+
furnished to do so, subject to the following conditions:
|
19
|
+
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
21
|
+
copies or substantial portions of the Software.
|
22
|
+
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
29
|
+
SOFTWARE.
|
30
|
+
"""
|
31
|
+
|
32
|
+
# Parent class for all the string algorithms implemented in this module
class StringAlgs:
    """
    Base class shared by every string-distance algorithm in this module.

    It only stores the weight assigned to a character match, which the
    subclasses reuse inside their scoring functions.
    """

    def __init__(self,
        match_weight: float = 0.0,
    ) -> None:
        # Cost contributed by two equal elements; 0.0 keeps the classic behavior.
        self.match_weight = match_weight
|
43
|
+
|
44
|
+
# Levenshtein edit distance class
class LevenshteinEditDistance(StringAlgs):
    def __init__(self,
        match_weight: float = 0.0,
        insert_weight: float = 1.0,
        delete_weight: float = 1.0,
        substitute_weight: float = 1.0,
    ) -> None:
        r"""
        This class initializes the Levenshtein edit distance algorithm. Levenshtein edit distance represents the minimum number of edit distance operations (insertion, deletion, and substitution) required to convert one string to another.

        The Levenshtein edit distance (with unit cost for each edit distance operation) is given by the following recurrence relation:

        .. math::
            :nowrap:

            \begin{align}
            d[i, j] := \min( & d[i-1, j-1] + \texttt{mismatch}(i, j), \\
                        & d[i-1, j] + 1, \\
                        & d[i, j-1] + 1),
            \end{align}

        where :math:`\texttt{mismatch}(i, j)` is 1 if the i-th element in str1 is not equal to the j-th element in str2, and 0 otherwise.

        Arguments:
            match_weight (float): The weight of a match (default: 0.0).
            insert_weight (float): The weight of an insertion (default: 1.0).
            delete_weight (float): The weight of a deletion (default: 1.0).
            substitute_weight (float): The weight of a substitution (default: 1.0).

        Raises:
            AssertionError: If any of the weights are negative.
        """
        # Set the match weight
        super().__init__(match_weight=match_weight)

        # Set the insert, delete, and substitute weights
        self.insert_weight = insert_weight
        self.delete_weight = delete_weight
        self.substitute_weight = substitute_weight

        # Assert that all the weights are non-negative
        assert min(match_weight, insert_weight, delete_weight, substitute_weight) >= 0.0

    def _align_cost(self, elem1, elem2) -> float:
        """Return the cost of aligning two single elements: match_weight on equality, substitute_weight otherwise."""
        return self.match_weight if elem1 == elem2 else self.substitute_weight

    # Compute the Levenshtein edit distance between two strings using recursion
    def compute_recursive(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        r"""
        This function computes the Levenshtein edit distance between two strings (or lists of strings) using recursion.

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The Levenshtein edit distance between the two strings.

        .. note::
            * The solution presented here utilizes recursion to compute the Levenshtein edit distance between two strings. It has an exponential time complexity and is not recommended for pairs of strings with a large length.
            * The time complexity of this function is :math:`O(3^{m+n})`, where :math:`m` and :math:`n` are the lengths of the two strings.
        """
        # Base case
        if len(str1) == 0:
            return len(str2) * self.insert_weight
        elif len(str2) == 0:
            return len(str1) * self.delete_weight

        # BUGFIX: use match_weight (instead of a hard-coded 0.0) for equal elements,
        # so the recursive solution agrees with compute_dynamic_programming whenever
        # match_weight != 0.
        mismatch = self._align_cost(str1[-1], str2[-1])

        # Compute the Levenshtein edit distance
        return min(
            self.compute_recursive(str1[:-1], str2[:-1]) + mismatch,
            self.compute_recursive(str1[:-1], str2) + self.delete_weight,
            self.compute_recursive(str1, str2[:-1]) + self.insert_weight,
        )

    # Compute the Levenshtein edit distance between two strings using memoization
    def compute_recursive_memoization(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        r"""
        This function computes the Levenshtein edit distance between two strings (or lists of strings) using memoization.

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The Levenshtein edit distance between the two strings.

        .. note::
            * The solution presented here utilizes memoization to compute the Levenshtein edit distance between two strings.
            * The time complexity of this function is :math:`\mathcal{O}(m n)`, where :math:`m` and :math:`n` are the lengths of the two strings.
        """
        # ROBUSTNESS: lists are unhashable and cannot be memoization keys; convert
        # them to tuples so list-of-str inputs (advertised in the signature) work.
        if isinstance(str1, list):
            str1 = tuple(str1)
        if isinstance(str2, list):
            str2 = tuple(str2)

        # Initialize the memoization dictionary
        memoization = {}

        # Compute the Levenshtein edit distance
        return self.compute_memoization_helper(str1, str2, memoization)

    # Compute the Levenshtein edit distance between two strings using memoization (helper function)
    def compute_memoization_helper(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        memoization: Dict[Tuple[str, str], float],
    ) -> float:
        r"""
        This is a helper function that computes the Levenshtein edit distance between two strings (or lists of strings) using memoization.

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).
            memoization (dict): The memoization dictionary.

        Returns:
            The Levenshtein edit distance between the two strings.

        .. note::
            * The solution presented here utilizes memoization to compute the Levenshtein edit distance between two strings.
            * One can also use the :func:`functools.lru_cache` (@lru_cache()) decorator to memoize the function calls. However, for the sake of educational purposes, we have implemented memoization using a dictionary.
            * The time complexity of this function is quadratic, that is :math:`\mathcal{O}(nm)`, where m and n are the lengths of the two strings.
        """
        # Base case
        if len(str1) == 0:
            return len(str2) * self.insert_weight
        elif len(str2) == 0:
            return len(str1) * self.delete_weight

        # Check if the Levenshtein edit distance has already been computed
        if (str1, str2) in memoization:
            return memoization[(str1, str2)]

        # BUGFIX: use match_weight (instead of a hard-coded 0.0) for equal elements,
        # consistent with compute_dynamic_programming.
        mismatch = self._align_cost(str1[-1], str2[-1])

        # Compute and cache the Levenshtein edit distance
        memoization[(str1, str2)] = min(
            self.compute_memoization_helper(str1[:-1], str2[:-1], memoization) + mismatch,
            self.compute_memoization_helper(str1[:-1], str2, memoization) + self.delete_weight,
            self.compute_memoization_helper(str1, str2[:-1], memoization) + self.insert_weight,
        )

        # Return the Levenshtein edit distance
        return memoization[(str1, str2)]

    # Compute the Levenshtein edit distance between two strings using dynamic programming
    def compute_dynamic_programming(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        r"""
        This function computes the Levenshtein edit distance between two strings (or lists of strings) using dynamic programming (Wagner-Fischer algorithm).

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The Levenshtein edit distance between the two strings.

        .. note::
            * The solution presented here utilizes dynamic programming principles to compute the Levenshtein edit distance between two strings.
            * This solution is also known as the Wagner-Fischer algorithm. [WF1974]_
            * The time complexity of this dynamic-programming-based solution is :math:`\mathcal{O}(nm)`, and the space complexity is :math:`\mathcal{O}(nm)`, where n and m are the lengths of the two strings, respectively.
            * However, by using only two rows of the distance matrix at a time, the space complexity of the dynamic programming solution can be reduced to :math:`\mathcal{O}(min(n, m))`.
            * The time complexity cannot be made strongly subquadratic time unless SETH is false. [BI2015]_
            * Finally, we note that this solution can be extended to cases where each edit distance operation has a non-unit cost.

        .. [WF1974] Wagner, R.A. and Fischer, M.J., 1974. The string-to-string correction problem. Journal of the ACM (JACM), 21(1), pp.168-173.
        .. [BI2015] Backurs, A. and Indyk, P., 2015, June. Edit distance cannot be computed in strongly subquadratic time (unless SETH is false). In Proceedings of the forty-seventh annual ACM symposium on Theory of computing (pp. 51-58).
        """
        # Lengths of strings str1 and str2, respectively.
        n = len(str1)
        m = len(str2)

        # Initialize the distance matrix; the first row/column hold pure
        # insertion/deletion costs.
        dist = np.zeros((n + 1, m + 1))
        for i in range(1, n + 1):
            dist[i, 0] = self.delete_weight * i
        for j in range(1, m + 1):
            dist[0, j] = self.insert_weight * j

        # Dynamic programming step:
        # d[i, j] := min(d[i-1, j-1] + mismatch(i, j), d[i-1, j] + delete, d[i, j-1] + insert),
        # where mismatch(i, j) is substitute_weight if str1[i] != str2[j] and match_weight otherwise.
        for i in range(1, n + 1):
            for j in range(1, m + 1):
                # Compute the minimum edit distance between str1[:i] and str2[:j].
                dist[i, j] = min(
                    dist[i-1, j-1] + self._align_cost(str1[i-1], str2[j-1]),
                    dist[i-1, j] + self.delete_weight,
                    dist[i, j-1] + self.insert_weight,
                )

        # Return the Levenshtein edit distance between str1 and str2.
        return dist[n, m]

    # Compute the Levenshtein edit distance between two strings
    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        method: str = "dynamic-programming",
    ) -> float:
        r"""
        This function computes the Levenshtein edit distance between two strings (or lists of strings), using the method specified by the user.

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).
            method (str): The method to use to compute the Levenshtein edit distance (default: "dynamic-programming").

        Returns:
            The Levenshtein edit distance between the two strings.

        .. note::
            * The method can be one of the following:
                * "recursive": This method computes the Levenshtein edit distance using recursion.
                * "recursive-memoization": This method computes the Levenshtein edit distance using recursion with memoization.
                * "dynamic-programming": This method computes the Levenshtein edit distance using dynamic programming (Wagner-Fischer algorithm).
            * By default, the method is "dynamic-programming".
            * Any unrecognized method name falls back to dynamic programming (kept for backward compatibility).
        """
        if method == "recursive":
            return self.compute_recursive(str1, str2)
        elif method == "recursive-memoization":
            return self.compute_recursive_memoization(str1, str2)
        # Default (and fallback for unknown method names): dynamic programming.
        return self.compute_dynamic_programming(str1, str2)
|
286
|
+
|
287
|
+
# Hamming (edit) distance class
class HammingDistance(StringAlgs):
    def __init__(self,
        match_weight: float = 0.0,
        substitute_weight: float = 1.0,
    ) -> None:
        r"""
        This function initializes the class variables of the Hamming distance.

        The Hamming distance is the number of positions at which the corresponding symbols are different. [H1950]_

        Arguments:
            match_weight (float): The weight of a match (default: 0.0).
            substitute_weight (float): The weight of a substitution (default: 1.0).

        Raises:
            AssertionError: If the substitute weight is negative.

        .. note::
            * The Hamming distance has a time complexity of :math:`\mathcal{O}(n)`, where :math:`n` is the length of the two strings.

        .. [H1950] Hamming, R.W., 1968. Error detecting and error correcting codes. Bell System Technical Journal, 29(2), pp.147-160.
        """
        super().__init__(match_weight=match_weight)
        self.substitute_weight = substitute_weight

        # The substitution weight must not be negative.
        assert substitute_weight >= 0.0

    # Compute the Hamming distance between two strings
    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        """
        This function computes the Hamming distance between two strings (or lists of strings).

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The Hamming distance between the two strings.

        Raises:
            ValueError: If the two strings (or lists of strings) have different lengths.
        """
        # The Hamming distance is only defined for equal-length sequences.
        if len(str1) != len(str2):
            raise ValueError("The two strings (or lists of strings) must have the same length.")

        # Accumulate the per-position cost: substitute_weight on mismatch,
        # match_weight on match.
        total = 0
        for left, right in zip(str1, str2):
            total += self.substitute_weight if left != right else self.match_weight
        return total
|
353
|
+
|
354
|
+
|
355
|
+
# Damerau-Levenshtein edit distance class
class DamerauLevenshteinDistance(LevenshteinEditDistance):
    def __init__(self,
        match_weight: float = 0.0,
        insert_weight: float = 1.0,
        delete_weight: float = 1.0,
        substitute_weight: float = 1.0,
        adjacent_transpose_weight: float = 1.0,
    ) -> None:
        r"""
        This function initializes the class variables of the Damerau-Levenshtein distance.

        The Damerau-Levenshtein distance is the minimum number of insertions, deletions, substitutions, and transpositions required to transform one string into the other. [D1964]_

        Arguments:
            match_weight (float): The weight of a match (default: 0.0).
            insert_weight (float): The weight of an insertion (default: 1.0).
            delete_weight (float): The weight of a deletion (default: 1.0).
            substitute_weight (float): The weight of a substitution (default: 1.0).
            adjacent_transpose_weight (float): The weight of an adjacent transposition (default: 1.0).

        Raises:
            AssertionError: If the insert, delete, substitute, or adjacent transpose weights are negative.

        .. [D1964] Damerau, F.J., 1964. A technique for computer detection and correction of spelling errors. Communications of the ACM, 7(3), pp.171-176.
        """
        # Delegate the four shared weights to the Levenshtein base class.
        super().__init__(
            match_weight=match_weight,
            insert_weight=insert_weight,
            delete_weight=delete_weight,
            substitute_weight=substitute_weight,
        )

        self.adjacent_transpose_weight = adjacent_transpose_weight

        # The transposition weight must not be negative.
        assert adjacent_transpose_weight >= 0.0

    # Compute the Damerau-Levenshtein edit distance between two strings
    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        """
        This function computes the Damerau-Levenshtein edit distance between two strings (or lists of strings).

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The Damerau-Levenshtein distance between the two strings.

        .. note::
            * The Damerau-Levenshtein distance is a variant of the Levenshtein distance that allows for adjacent transpositions.
            * NOTE(review): this recurrence (no per-alphabet bookkeeping) appears to compute the restricted "optimal string alignment" variant, where a substring may not be edited after a transposition — confirm against the intended reference before relying on the unrestricted definition.
            * The dynamic programming solution to the Damerau-Levenshtein distance has a time complexity of :math:`\mathcal{O}(nm)`, where n and m are the lengths of the two strings.
        """
        rows = len(str1)
        cols = len(str2)

        # First row/column of the table hold pure deletion/insertion costs.
        table = np.zeros((rows + 1, cols + 1))
        for r in range(1, rows + 1):
            table[r, 0] = self.delete_weight * r
        for c in range(1, cols + 1):
            table[0, c] = self.insert_weight * c

        # Same dynamic program as the Levenshtein distance, plus one extra
        # candidate move for swapping two adjacent elements.
        for r in range(1, rows + 1):
            for c in range(1, cols + 1):
                align_cost = self.match_weight if str1[r-1] == str2[c-1] else self.substitute_weight
                best = min(
                    table[r-1, c-1] + align_cost,
                    table[r-1, c] + self.delete_weight,
                    table[r, c-1] + self.insert_weight,
                )
                # The transposition candidate is the only difference from the
                # plain Levenshtein recurrence.
                if r > 1 and c > 1 and str1[r-1] == str2[c-2] and str1[r-2] == str2[c-1]:
                    best = min(best, table[r-2, c-2] + self.adjacent_transpose_weight)
                table[r, c] = best

        # Damerau-Levenshtein edit distance between str1 and str2.
        return table[rows, cols]
|
@@ -0,0 +1,126 @@
|
|
1
|
+
import numpy as np
|
2
|
+
|
3
|
+
"""
|
4
|
+
source code = https://visualstudiomagazine.com/articles/2021/08/16/wasserstein-distance.aspx
|
5
|
+
By James McCaffrey
|
6
|
+
"""
|
7
|
+
|
8
|
+
from nltkor.make_requirement import make_requirement
|
9
|
+
try:
|
10
|
+
import torch
|
11
|
+
except ImportError:
|
12
|
+
requirement = ['torch']
|
13
|
+
file_path = make_requirement(requirement)
|
14
|
+
raise Exception(f"""
|
15
|
+
Need to install Libraries, please pip install below libraries
|
16
|
+
\t pip install torch
|
17
|
+
Or, use pip install requirement.txt
|
18
|
+
\t pip install -r {file_path}
|
19
|
+
""")
|
20
|
+
|
21
|
+
|
22
|
+
class WassersteinDistance:
|
23
|
+
def __init__(self) -> None:
|
24
|
+
pass
|
25
|
+
|
26
|
+
def first_nonzero(self, vec):
|
27
|
+
dim = len(vec)
|
28
|
+
for i in range(dim):
|
29
|
+
if vec[i] > 0.0:
|
30
|
+
return i
|
31
|
+
return -1 # no empty cells found
|
32
|
+
|
33
|
+
def move_dirt(self, dirt, di, holes, hi):
|
34
|
+
# move as much dirt at [di] as possible to h[hi]
|
35
|
+
if dirt[di] <= holes[hi]: # use all dirt
|
36
|
+
flow = dirt[di]
|
37
|
+
dirt[di] = 0.0 # all dirt got moved
|
38
|
+
holes[hi] -= flow # less to fill now
|
39
|
+
elif dirt[di] > holes[hi]: # use just part of dirt
|
40
|
+
flow = holes[hi] # fill remainder of hole
|
41
|
+
dirt[di] -= flow # less dirt left
|
42
|
+
holes[hi] = 0.0 # hole is filled
|
43
|
+
dist = np.abs(di - hi)
|
44
|
+
return flow * dist # work
|
45
|
+
|
46
|
+
def compute_wasserstein(self, p, q):
|
47
|
+
if "torch" in str(type(p)):
|
48
|
+
p = p.numpy()
|
49
|
+
if "torch" in str(type(q)):
|
50
|
+
q = q.numpy()
|
51
|
+
|
52
|
+
dirt = np.copy(p)
|
53
|
+
holes = np.copy(q)
|
54
|
+
tot_work = 0.0
|
55
|
+
|
56
|
+
while True: # TODO: add sanity counter check
|
57
|
+
from_idx = self.first_nonzero(dirt)
|
58
|
+
to_idx = self.first_nonzero(holes)
|
59
|
+
if from_idx == -1 or to_idx == -1:
|
60
|
+
break
|
61
|
+
work = self.move_dirt(dirt, from_idx, holes, to_idx)
|
62
|
+
tot_work += work
|
63
|
+
return tot_work
|
64
|
+
|
65
|
+
def kullback_leibler(self, p, q):
|
66
|
+
n = len(p)
|
67
|
+
sum = 0.0
|
68
|
+
for i in range(n):
|
69
|
+
sum += p[i] * np.log(p[i] / q[i])
|
70
|
+
return sum
|
71
|
+
|
72
|
+
def compute_kullback(self, p, q):
|
73
|
+
if "torch" in str(type(p)):
|
74
|
+
p = p.numpy()
|
75
|
+
if "torch" in str(type(q)):
|
76
|
+
q = q.numpy()
|
77
|
+
a = self.kullback_leibler(p, q)
|
78
|
+
b = self.kullback_leibler(q, p)
|
79
|
+
return a + b
|
80
|
+
|
81
|
+
def compute_jesson_shannon(self, p, q):
|
82
|
+
if "torch" in str(type(p)):
|
83
|
+
p = p.numpy()
|
84
|
+
if "torch" in str(type(q)):
|
85
|
+
q = q.numpy()
|
86
|
+
|
87
|
+
a = self.kullback_leibler(p, (p + q)/2)
|
88
|
+
b = self.kullback_leibler(q, (p + q)/2)
|
89
|
+
return (a + b)/2
|
90
|
+
|
91
|
+
|
92
|
+
def demo():
    """Run a small demonstration comparing KL, Wasserstein, and Jensen-Shannon
    distances on three hand-picked 5-bin distributions."""
    print("\nBegin Wasserstein distance demo ")

    # P is peaked on the first bin; Q1 and Q2 move that peak to the middle and
    # the last bin, respectively — same KL, different Wasserstein.
    P = torch.from_numpy(np.array([0.6, 0.1, 0.1, 0.1, 0.1]))
    Q1 = torch.from_numpy(np.array([0.1, 0.1, 0.6, 0.1, 0.1]))
    Q2 = torch.from_numpy(np.array([0.1, 0.1, 0.1, 0.1, 0.6]))

    metric = WassersteinDistance()

    kl_p_q1 = metric.compute_kullback(P, Q1)
    kl_p_q2 = metric.compute_kullback(P, Q2)

    wass_p_q1 = metric.compute_wasserstein(P, Q1)
    wass_p_q2 = metric.compute_wasserstein(P, Q2)

    jesson_p_q1 = metric.compute_jesson_shannon(P, Q1)
    jesson_p_q2 = metric.compute_jesson_shannon(P, Q2)

    print("\nKullback-Leibler distances: ")
    print("P to Q1 : %0.4f " % kl_p_q1)
    print("P to Q2 : %0.4f " % kl_p_q2)

    print("\nWasserstein distances: ")
    print("P to Q1 : %0.4f " % wass_p_q1)
    print("P to Q2 : %0.4f " % wass_p_q2)

    print("\nJesson-Shannon distances: ")
    print("P to Q1 : %0.4f " % jesson_p_q1)
    print("P to Q2 : %0.4f " % jesson_p_q2)

    print("\nEnd demo ")

if __name__ == "__main__":
    demo()
|
nltkor/etc.py
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
import re
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
def parse_morph(target):
    """Parse a morpheme-analysis string into (surface, tag) pairs.

    The expected input looks like ``"surface/TAG+surface/TAG..."`` where each
    tag is a run of Latin letters terminated by ``+``, a newline, or the end
    of the string; spaces are ignored and tab characters reset the start of a
    surface form.

    Arguments:
        target (str): The raw tagged string to parse.

    Returns:
        list of (str, str) tuples: ``(surface_form, tag)`` pairs in input order.
    """
    # Compile once before the loop.  A raw string is used because "\/" is an
    # invalid escape sequence (SyntaxWarning on modern Python); "/" needs no
    # escaping in a regex.
    tag_pattern = re.compile(r"(/([A-Za-z]+)(?:\+|\n|$))")

    result = list()
    buf = target
    buf = re.sub(' ', '', buf)  # the format is whitespace-insensitive; drop spaces
    loop = tag_pattern.search(buf)

    while loop:
        # Locate the matched "/TAG+" text; the surface form runs from the
        # last tab (or the start of the buffer) up to the slash.
        pos = buf.find(loop.group(1))
        spos = buf[:pos].rfind("\t") + 1
        result.append((buf[spos:pos], loop.group(2)))
        # Consume everything through the matched tag and continue.
        buf = buf[pos + len(loop.group(1)):]
        loop = tag_pattern.search(buf)
    return result
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
|