nltkor 1.2.14__cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,441 @@
1
+ # Import relevant libraries and dependencies
2
+ from typing import List, Union, Dict, Tuple
3
+ import numpy as np
4
+ from .wasserstein import WassersteinDistance
5
+
6
+ """
7
+ add string2string module code, src = https://github.com/stanfordnlp/string2string
8
+
9
+ MIT License
10
+
11
+ Copyright (c) 2023 Mirac Suzgun
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ """
31
+
32
# Parent class for all the string algorithms implemented in this module
class StringAlgs:
    """
    Base class shared by every string-distance algorithm in this module.

    It only records the weight charged for a "match" operation; subclasses
    reuse it inside their own cost computations.
    """

    def __init__(self, match_weight: float = 0.0) -> None:
        """
        Arguments:
            match_weight (float): cost charged when two symbols are equal
                (default: 0.0).
        """
        # Remember the match weight for use by subclasses.
        self.match_weight = match_weight
+
44
# Levenshtein edit distance class
class LevenshteinEditDistance(StringAlgs):
    def __init__(self,
        match_weight: float = 0.0,
        insert_weight: float = 1.0,
        delete_weight: float = 1.0,
        substitute_weight: float = 1.0,
    ) -> None:
        r"""
        Levenshtein edit distance: the minimum total weight of edit operations
        (insertion, deletion, and substitution) required to convert one string
        into another.

        With unit costs the distance satisfies the recurrence

        .. math::
            :nowrap:

            \begin{align}
                d[i, j] := \min( & d[i-1, j-1] + \texttt{mismatch}(i, j), \\
                                 & d[i-1, j] + 1, \\
                                 & d[i, j-1] + 1),
            \end{align}

        where :math:`\texttt{mismatch}(i, j)` is 1 if the i-th element in str1
        is not equal to the j-th element in str2, and 0 otherwise.

        Arguments:
            match_weight (float): The weight of a match (default: 0.0).
            insert_weight (float): The weight of an insertion (default: 1.0).
            delete_weight (float): The weight of a deletion (default: 1.0).
            substitute_weight (float): The weight of a substitution (default: 1.0).

        Raises:
            AssertionError: If any of the weights are negative.
        """
        super().__init__(match_weight=match_weight)

        self.insert_weight = insert_weight
        self.delete_weight = delete_weight
        self.substitute_weight = substitute_weight

        # Negative operation weights would make the recurrences meaningless.
        assert min(match_weight, insert_weight, delete_weight, substitute_weight) >= 0.0

    def _pair_cost(self, a, b) -> float:
        """Cost of aligning symbol *a* with symbol *b* (match vs. substitution)."""
        return self.match_weight if a == b else self.substitute_weight

    def compute_recursive(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        r"""
        Compute the Levenshtein edit distance between two strings (or lists of
        strings) using plain recursion.

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The Levenshtein edit distance between the two strings.

        .. note::
            Exponential time, :math:`O(3^{m+n})`; only suitable for very short
            inputs.
        """
        # Base cases: converting to/from the empty string.
        if len(str1) == 0:
            return len(str2) * self.insert_weight
        if len(str2) == 0:
            return len(str1) * self.delete_weight

        # FIX: a match previously cost a hard-coded 0.0 here, which disagreed
        # with compute_dynamic_programming (it charges self.match_weight).
        # All methods now share one cost model; the default (0.0) is unchanged.
        last_cost = self._pair_cost(str1[-1], str2[-1])

        return min(
            self.compute_recursive(str1[:-1], str2[:-1]) + last_cost,
            self.compute_recursive(str1[:-1], str2) + self.delete_weight,
            self.compute_recursive(str1, str2[:-1]) + self.insert_weight,
        )

    def compute_recursive_memoization(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        r"""
        Compute the Levenshtein edit distance between two strings (or lists of
        strings) using recursion with memoization.

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The Levenshtein edit distance between the two strings.

        .. note::
            Quadratic time, :math:`\mathcal{O}(mn)`.
        """
        # FIX: lists are unhashable and previously crashed the memoization
        # dict; convert them to tuples (indexing/slicing behaves the same).
        if isinstance(str1, list):
            str1 = tuple(str1)
        if isinstance(str2, list):
            str2 = tuple(str2)
        return self.compute_memoization_helper(str1, str2, {})

    def compute_memoization_helper(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        memoization: Dict[Tuple[str, str], float],
    ) -> float:
        r"""
        Recursive worker for :meth:`compute_recursive_memoization`.

        Arguments:
            str1 (str or tuple of str): The first string (or tuple of strings).
            str2 (str or tuple of str): The second string (or tuple of strings).
            memoization (dict): cache mapping ``(str1, str2)`` to a distance.

        Returns:
            The Levenshtein edit distance between the two strings.

        .. note::
            One could also use :func:`functools.lru_cache`; a dictionary is
            kept here for educational purposes. Time is :math:`\mathcal{O}(nm)`.
        """
        if len(str1) == 0:
            return len(str2) * self.insert_weight
        if len(str2) == 0:
            return len(str1) * self.delete_weight

        key = (str1, str2)
        if key in memoization:
            return memoization[key]

        # Same cost model as the other methods (see compute_recursive).
        last_cost = self._pair_cost(str1[-1], str2[-1])

        memoization[key] = min(
            self.compute_memoization_helper(str1[:-1], str2[:-1], memoization) + last_cost,
            self.compute_memoization_helper(str1[:-1], str2, memoization) + self.delete_weight,
            self.compute_memoization_helper(str1, str2[:-1], memoization) + self.insert_weight,
        )
        return memoization[key]

    def compute_dynamic_programming(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        r"""
        Compute the Levenshtein edit distance between two strings (or lists of
        strings) using dynamic programming (Wagner-Fischer algorithm). [WF1974]_

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The Levenshtein edit distance between the two strings.

        .. note::
            * Time and space are :math:`\mathcal{O}(nm)`; keeping only two rows
              would reduce space to :math:`\mathcal{O}(\min(n, m))`.
            * Strongly subquadratic time is impossible unless SETH is
              false. [BI2015]_

        .. [WF1974] Wagner, R.A. and Fischer, M.J., 1974. The string-to-string
           correction problem. Journal of the ACM (JACM), 21(1), pp.168-173.
        .. [BI2015] Backurs, A. and Indyk, P., 2015. Edit distance cannot be
           computed in strongly subquadratic time (unless SETH is false).
           In Proceedings of STOC 2015 (pp. 51-58).
        """
        n = len(str1)
        m = len(str2)

        # dist[i, j] holds the distance between str1[:i] and str2[:j].
        dist = np.zeros((n + 1, m + 1))
        for i in range(1, n + 1):
            dist[i, 0] = self.delete_weight * i
        for j in range(1, m + 1):
            dist[0, j] = self.insert_weight * j

        for i in range(1, n + 1):
            for j in range(1, m + 1):
                dist[i, j] = min(
                    dist[i-1, j-1] + self._pair_cost(str1[i-1], str2[j-1]),
                    dist[i-1, j] + self.delete_weight,
                    dist[i, j-1] + self.insert_weight,
                )

        return dist[n, m]

    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        method: str = "dynamic-programming",
    ) -> float:
        r"""
        Compute the Levenshtein edit distance between two strings (or lists of
        strings), using the method specified by the user.

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).
            method (str): One of "recursive", "recursive-memoization", or
                "dynamic-programming" (default: "dynamic-programming"; any
                unrecognized value also falls back to dynamic programming).

        Returns:
            The Levenshtein edit distance between the two strings.
        """
        if method == "recursive":
            return self.compute_recursive(str1, str2)
        if method == "recursive-memoization":
            return self.compute_recursive_memoization(str1, str2)
        return self.compute_dynamic_programming(str1, str2)
+
287
# Hamming (edit) distance class
class HammingDistance(StringAlgs):
    def __init__(self,
        match_weight: float = 0.0,
        substitute_weight: float = 1.0,
    ) -> None:
        r"""
        Hamming distance: the (weighted) number of positions at which the
        corresponding symbols of two equal-length sequences differ. [H1950]_

        Arguments:
            match_weight (float): The weight of a match (default: 0.0).
            substitute_weight (float): The weight of a substitution (default: 1.0).

        Raises:
            AssertionError: If the substitute weight is negative.

        .. note::
            * Runs in linear time, :math:`\mathcal{O}(n)`, where :math:`n` is
              the length of the two sequences.

        .. [H1950] Hamming, R.W., 1950. Error detecting and error correcting
           codes. Bell System Technical Journal, 29(2), pp.147-160.
        """
        super().__init__(match_weight=match_weight)
        self.substitute_weight = substitute_weight
        # A negative substitution cost would not define a distance.
        assert substitute_weight >= 0.0

    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        """
        Compute the Hamming distance between two strings (or lists of strings).

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The Hamming distance between the two strings.

        Raises:
            ValueError: If the two strings (or lists of strings) have different lengths.
        """
        # The Hamming distance is defined only for equal-length sequences.
        if len(str1) != len(str2):
            raise ValueError("The two strings (or lists of strings) must have the same length.")

        # Charge each aligned pair either the match or the substitution weight.
        total = 0
        for left, right in zip(str1, str2):
            total += self.match_weight if left == right else self.substitute_weight
        return total
+
354
+
355
# Damerau-Levenshtein edit distance class
class DamerauLevenshteinDistance(LevenshteinEditDistance):
    def __init__(self,
        match_weight: float = 0.0,
        insert_weight: float = 1.0,
        delete_weight: float = 1.0,
        substitute_weight: float = 1.0,
        adjacent_transpose_weight: float = 1.0,
    ) -> None:
        r"""
        Damerau-Levenshtein distance: the minimum number of insertions,
        deletions, substitutions, and adjacent transpositions required to
        transform one string into the other. [D1964]_

        Arguments:
            match_weight (float): The weight of a match (default: 0.0).
            insert_weight (float): The weight of an insertion (default: 1.0).
            delete_weight (float): The weight of a deletion (default: 1.0).
            substitute_weight (float): The weight of a substitution (default: 1.0).
            adjacent_transpose_weight (float): The weight of an adjacent
                transposition (default: 1.0).

        Raises:
            AssertionError: If any of the weights are negative.

        .. [D1964] Damerau, F.J., 1964. A technique for computer detection and
           correction of spelling errors. Communications of the ACM, 7(3),
           pp.171-176.
        """
        # Delegate the four Levenshtein weights to the parent class.
        super().__init__(
            match_weight=match_weight,
            insert_weight=insert_weight,
            delete_weight=delete_weight,
            substitute_weight=substitute_weight,
        )
        self.adjacent_transpose_weight = adjacent_transpose_weight
        # Transposition cost must be non-negative like every other weight.
        assert adjacent_transpose_weight >= 0.0

    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        """
        Compute the Damerau-Levenshtein edit distance between two strings
        (or lists of strings).

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The Damerau-Levenshtein distance between the two strings.

        .. note::
            * Variant of the Levenshtein distance that additionally allows
              adjacent transpositions.
            * The dynamic-programming solution runs in :math:`\mathcal{O}(nm)`
              time, where n and m are the lengths of the two strings.
        """
        rows = len(str1)
        cols = len(str2)

        # table[r, c] holds the distance between str1[:r] and str2[:c];
        # first row/column encode pure insertions/deletions.
        table = np.zeros((rows + 1, cols + 1))
        for r in range(1, rows + 1):
            table[r, 0] = self.delete_weight * r
        for c in range(1, cols + 1):
            table[0, c] = self.insert_weight * c

        for r in range(1, rows + 1):
            for c in range(1, cols + 1):
                if str1[r-1] == str2[c-1]:
                    diagonal_cost = self.match_weight
                else:
                    diagonal_cost = self.substitute_weight
                table[r, c] = min(
                    table[r-1, c-1] + diagonal_cost,
                    table[r-1, c] + self.delete_weight,
                    table[r, c-1] + self.insert_weight,
                )
                # Adjacent transposition: the single extra case that
                # distinguishes Damerau-Levenshtein from plain Levenshtein.
                if r > 1 and c > 1 and str1[r-1] == str2[c-2] and str1[r-2] == str2[c-1]:
                    table[r, c] = min(
                        table[r, c],
                        table[r-2, c-2] + self.adjacent_transpose_weight,
                    )

        return table[rows, cols]
@@ -0,0 +1,126 @@
1
+ import numpy as np
2
+
3
+ """
4
+ source code = https://visualstudiomagazine.com/articles/2021/08/16/wasserstein-distance.aspx
5
+ By James McCaffrey
6
+ """
7
+
8
+ from nltkor.make_requirement import make_requirement
9
+ try:
10
+ import torch
11
+ except ImportError:
12
+ requirement = ['torch']
13
+ file_path = make_requirement(requirement)
14
+ raise Exception(f"""
15
+ Need to install Libraries, please pip install below libraries
16
+ \t pip install torch
17
+ Or, use pip install requirement.txt
18
+ \t pip install -r {file_path}
19
+ """)
20
+
21
+
22
class WassersteinDistance:
    """
    One-dimensional Wasserstein ("earth mover's") distance plus symmetric
    Kullback-Leibler and Jensen-Shannon divergences between two discrete
    distributions given as NumPy arrays (or CPU torch tensors).
    """

    def __init__(self) -> None:
        # Stateless helper class; nothing to initialize.
        pass

    def first_nonzero(self, vec):
        """Return the index of the first strictly positive entry, or -1 if none."""
        for idx, value in enumerate(vec):
            if value > 0.0:
                return idx
        return -1  # no empty cells found

    def move_dirt(self, dirt, di, holes, hi):
        """
        Move as much dirt as possible from dirt[di] into holes[hi] (in place)
        and return the work done (amount moved times index distance).
        """
        if dirt[di] <= holes[hi]:
            # All dirt at di fits into the hole.
            flow = dirt[di]
            dirt[di] = 0.0
            holes[hi] -= flow
        else:
            # Only part of the dirt is needed; the hole is filled completely.
            flow = holes[hi]
            dirt[di] -= flow
            holes[hi] = 0.0
        return flow * np.abs(di - hi)  # work = flow * distance

    def compute_wasserstein(self, p, q):
        """Greedy earth-mover computation of the 1-D Wasserstein distance."""
        # Accept CPU torch tensors by converting them to NumPy first.
        if "torch" in str(type(p)):
            p = p.numpy()
        if "torch" in str(type(q)):
            q = q.numpy()

        # Work on copies so the caller's arrays are left untouched.
        dirt = np.copy(p)
        holes = np.copy(q)
        total_work = 0.0

        while True:  # TODO: add sanity counter check
            src = self.first_nonzero(dirt)
            dst = self.first_nonzero(holes)
            if src == -1 or dst == -1:
                break
            total_work += self.move_dirt(dirt, src, holes, dst)
        return total_work

    def kullback_leibler(self, p, q):
        """One-sided KL divergence: sum_i p[i] * log(p[i] / q[i])."""
        total = 0.0
        for i in range(len(p)):
            total += p[i] * np.log(p[i] / q[i])
        return total

    def compute_kullback(self, p, q):
        """Symmetric KL divergence: KL(p || q) + KL(q || p)."""
        if "torch" in str(type(p)):
            p = p.numpy()
        if "torch" in str(type(q)):
            q = q.numpy()
        return self.kullback_leibler(p, q) + self.kullback_leibler(q, p)

    def compute_jesson_shannon(self, p, q):
        """Jensen-Shannon divergence: mean of the KLs against the midpoint."""
        if "torch" in str(type(p)):
            p = p.numpy()
        if "torch" in str(type(q)):
            q = q.numpy()

        midpoint = (p + q) / 2
        return (self.kullback_leibler(p, midpoint)
                + self.kullback_leibler(q, midpoint)) / 2
90
+
91
+
92
def demo():
    """Run a small console demo comparing KL, Wasserstein, and JS distances."""
    print("\nBegin Wasserstein distance demo ")

    # Three example distributions as torch tensors (converted from NumPy).
    P = torch.from_numpy(np.array([0.6, 0.1, 0.1, 0.1, 0.1]))
    Q1 = torch.from_numpy(np.array([0.1, 0.1, 0.6, 0.1, 0.1]))
    Q2 = torch.from_numpy(np.array([0.1, 0.1, 0.1, 0.1, 0.6]))

    metric = WassersteinDistance()

    kl_p_q1 = metric.compute_kullback(P, Q1)
    kl_p_q2 = metric.compute_kullback(P, Q2)

    wass_p_q1 = metric.compute_wasserstein(P, Q1)
    wass_p_q2 = metric.compute_wasserstein(P, Q2)

    jesson_p_q1 = metric.compute_jesson_shannon(P, Q1)
    jesson_p_q2 = metric.compute_jesson_shannon(P, Q2)

    print("\nKullback-Leibler distances: ")
    print("P to Q1 : %0.4f " % kl_p_q1)
    print("P to Q2 : %0.4f " % kl_p_q2)

    print("\nWasserstein distances: ")
    print("P to Q1 : %0.4f " % wass_p_q1)
    print("P to Q2 : %0.4f " % wass_p_q2)

    print("\nJesson-Shannon distances: ")
    print("P to Q1 : %0.4f " % jesson_p_q1)
    print("P to Q2 : %0.4f " % jesson_p_q2)

    print("\nEnd demo ")

if __name__ == "__main__":
    demo()
nltkor/etc.py ADDED
@@ -0,0 +1,22 @@
1
+ import re
2
+
3
+
4
+
5
def parse_morph(target):
    """
    Parse a morphological-analysis string into (surface, POS-tag) pairs.

    The input is expected to contain segments of the form ``surface/TAG``
    joined by ``+`` or newlines (e.g. ``"dog/NNG+s/JX"``); spaces are removed
    before parsing, and a tab acts as a surface-form boundary.

    Arguments:
        target (str): raw analyzer output.

    Returns:
        list of (str, str): (surface, tag) tuples in order of appearance;
        empty list if no ``/TAG`` segment is found.
    """
    # FIX: the pattern previously used "\/" inside a regular (non-raw) string
    # literal, an invalid escape sequence that raises SyntaxWarning on modern
    # Python. A raw string with a plain "/" matches identically. The pattern
    # is also compiled once instead of being re-parsed on every iteration.
    tag_pattern = re.compile(r"(/([A-Za-z]+)(?:\+|\n|$))")

    result = []
    buf = target.replace(' ', '')  # spaces are never part of a segment

    match = tag_pattern.search(buf)
    while match:
        # Locate the "/TAG" delimiter; the surface form runs from the last
        # tab (or the start of the buffer) up to it.
        pos = buf.find(match.group(1))
        spos = buf[:pos].rfind("\t") + 1
        result.append((buf[spos:pos], match.group(2)))
        # Drop the consumed segment and look for the next one.
        buf = buf[pos + len(match.group(1)):]
        match = tag_pattern.search(buf)
    return result
19
+
20
+
21
+
22
+