nltkor 1.2.14__cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,569 @@
|
|
1
|
+
"""
|
2
|
+
string2string search
|
3
|
+
src = https://github.com/stanfordnlp/string2string
|
4
|
+
|
5
|
+
|
6
|
+
MIT License
|
7
|
+
|
8
|
+
Copyright (c) 2023 Mirac Suzgun
|
9
|
+
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
12
|
+
in the Software without restriction, including without limitation the rights
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
15
|
+
furnished to do so, subject to the following conditions:
|
16
|
+
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
18
|
+
copies or substantial portions of the Software.
|
19
|
+
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
26
|
+
SOFTWARE.
|
27
|
+
|
28
|
+
|
29
|
+
"""
|
30
|
+
|
31
|
+
|
32
|
+
"""
|
33
|
+
This module contains the following algorithms:
|
34
|
+
(-) Naive search algorithm ++
|
35
|
+
(a) Rabin-Karp algorithm ++
|
36
|
+
(b) Boyer-Moore algorithm ++
|
37
|
+
(c) Knuth-Morris-Pratt algorithm
|
38
|
+
(d) Suffix Tree algorithm
|
39
|
+
(e) Suffix Array algorithm
|
40
|
+
(f) Suffix Automaton algorithm
|
41
|
+
(g) Aho-Corasick algorithm (basis of fgrep/grep in Unix) ++ (not implemented)
|
42
|
+
(h) Ukkonen's algorithm -- (not implemented)
|
43
|
+
(i) Wu-Manber algorithm ++ (not implemented)
|
44
|
+
(j) Z-Algorithm ++ (not implemented)
|
45
|
+
"""
|
46
|
+
|
47
|
+
from typing import List, Union, Tuple, Optional
|
48
|
+
|
49
|
+
# for dev purposes
|
50
|
+
import sys
|
51
|
+
# sys.path.append("/Users/dowon/nltk_ko/nltk/misc")
|
52
|
+
from nltkor.misc.string2string_hash_functions import HashFunction, PolynomialRollingHash
|
53
|
+
# from string2string_hash_functions import HashFunction, PolynomialRollingHash
|
54
|
+
|
55
|
+
|
56
|
+
# Parent class for all search algorithms
|
57
|
+
class SearchAlgorithm:
    """
    Abstract parent class shared by every search algorithm in this module.

    Concrete subclasses are expected to override :meth:`search`; this base
    class only fixes the common call signature.
    """

    def __init__(self) -> None:
        """
        Initializes the abstract search-algorithm class (holds no state).

        Returns:
            None
        """
        pass

    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        Searches for the pattern in a text.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text.
        """
        # Intentionally a no-op: subclasses provide the real implementation.
        pass
|
86
|
+
|
87
|
+
|
88
|
+
class NaiveSearch(SearchAlgorithm):
    """
    Brute-force (naive) substring search: slide the pattern one position at a
    time and compare the full window at each offset.
    """

    def __init__(self) -> None:
        """
        Initializes the naive search algorithm.

        Returns:
            None
        """
        super().__init__()

    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        Searches for the pattern in the text by direct window comparison.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found).

        Raises:
            AssertionError: If the inputs are invalid.
        """
        # Validate the inputs.
        assert isinstance(pattern, str), 'The pattern must be a string.'
        assert isinstance(text, str), 'The text must be a string.'

        # Record the pattern on the instance (kept for parity with the other
        # search classes in this module).
        self.pattern = pattern
        self.pattern_length = len(pattern)

        # Compare the pattern against every window of the same length.
        for start in range(len(text) - self.pattern_length + 1):
            if text[start:start + self.pattern_length] == pattern:
                return start

        # No window matched.
        return -1
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
# Rabin-Karp search algorithm class
|
140
|
+
# Rabin-Karp search algorithm class
class RabinKarpSearch(SearchAlgorithm):
    """
    Rabin-Karp substring search driven by a (rolling) hash function. [RK1987]_
    """

    def __init__(self,
        hash_function: HashFunction = PolynomialRollingHash(),
    ) -> None:
        """
        Initializes the Rabin-Karp search algorithm class, which compares hash
        values of text windows against the hash of the pattern. [RK1987]_

        Arguments:
            hash_function (HashFunction): The hash function to use.

        Returns:
            None

        Raises:
            AssertionError: If the inputs are invalid.

        .. [RK1987] Karp, R.M. and Rabin, M.O., 1987. Efficient randomized pattern-matching algorithms. IBM Journal of Research and Development, 31(2), pp.249-260.
        """
        assert isinstance(hash_function, HashFunction), 'The hash function must be a HashFunction object.'

        # Keep the hash function on the instance; the pattern itself is set
        # lazily per call in itialize_pattern_hash().
        self.hash_function = hash_function

    # NOTE(review): this public name looks like a misspelling of
    # "initialize_pattern_hash"; kept as-is for interface compatibility.
    def itialize_pattern_hash(self,
        pattern: str,
    ) -> None:
        """
        Initializes the pattern hash value for a subsequent search.

        Arguments:
            pattern (str): The pattern to search for.

        Returns:
            None

        Raises:
            AssertionError: If the inputs are invalid.
        """
        assert isinstance(pattern, str), 'The pattern must be a string.'

        # A stale rolling state would corrupt the pattern hash, so reset first.
        self.hash_function.reset()

        self.pattern = pattern
        self.pattern_hash = self.hash_function.compute(pattern)
        self.pattern_length = len(pattern)

    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        Searches for the pattern in the text via rolling-hash comparison.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found).

        Raises:
            AssertionError: If the inputs are invalid.
        """
        assert isinstance(text, str), 'The text must be a string.'

        # Hash the pattern (this also resets the hash function's state).
        self.itialize_pattern_hash(pattern)

        # Reset again before hashing the first text window [Important!].
        self.hash_function.reset()

        # Hash of the initial window text[:pattern_length].
        window_hash = self.hash_function.compute(text[:self.pattern_length])

        last_start = len(text) - self.pattern_length
        for start in range(last_start + 1):
            # Only verify character-by-character on a hash hit, since distinct
            # strings can still collide under the hash.
            if window_hash == self.pattern_hash:
                matched = 0
                while text[start + matched] == self.pattern[matched]:
                    matched += 1
                    if matched == self.pattern_length:
                        return start
            # Roll the window hash forward by one character.
            if start < last_start:
                window_hash = self.hash_function.update(text[start], text[start + self.pattern_length], self.pattern_length)

        # No window hashed and matched.
        return -1
|
257
|
+
|
258
|
+
|
259
|
+
|
260
|
+
# Knuth-Morris-Pratt (KMP) search algorithm class
|
261
|
+
# Knuth-Morris-Pratt (KMP) search algorithm class
class KMPSearch(SearchAlgorithm):
    """
    Knuth-Morris-Pratt substring search. [KMP1977]_
    """

    def __init__(self) -> None:
        r"""
        Initializes the Knuth-Morris-Pratt (KMP) search algorithm class. [KMP1977]_

        Arguments:
            None

        Returns:
            None

        .. note::
            * This implementation precomputes the lps array ("longest proper prefix which is also a suffix"): lps[i] is the length of the longest proper prefix of the pattern that is also a suffix of pattern[:i+1].
            * The lps array lets the scan resume at the right pattern position after a mismatch instead of re-comparing characters, which is what gives KMP its linear running time.
            * A finite-state-automaton variant of KMP exists but is not implemented here.

        .. [KMP1977] Knuth, D.E., Morris, J.H. and Pratt, V.R., 1977. Fast pattern matching in strings. SIAM journal on computing, 6(2), pp.323-350.
        """
        super().__init__()

    def initialize_lps(self) -> None:
        r"""
        Builds the longest proper prefix-suffix (lps) array for self.pattern.

        For each index i, lps[i] is the length of the longest proper prefix of
        the pattern that is also a suffix of pattern[:i + 1].

        Arguments:
            None (reads self.pattern / self.pattern_length).

        Returns:
            None
        """
        self.lps = [0] * self.pattern_length

        pos = 1         # current index in the pattern
        prefix_len = 0  # length of the current matched proper prefix
        while pos < self.pattern_length:
            if self.pattern[pos] == self.pattern[prefix_len]:
                # Extend the current prefix match by one character.
                prefix_len += 1
                self.lps[pos] = prefix_len
                pos += 1
            elif prefix_len != 0:
                # Fall back to the next-shorter candidate prefix.
                prefix_len = self.lps[prefix_len - 1]
            else:
                # No prefix matches here at all.
                self.lps[pos] = 0
                pos += 1

    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        Searches for the pattern in the text using the KMP scan.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found)

        Raises:
            AssertionError: If the text is not a string.
        """
        assert isinstance(text, str), 'The text must be a string.'

        self.pattern = pattern
        self.pattern_length = len(pattern)

        # Precompute the failure table for this pattern.
        self.initialize_lps()

        t = 0  # position in the text
        p = 0  # position in the pattern
        text_length = len(text)
        while t < text_length:
            if self.pattern[p] == text[t]:
                t += 1
                p += 1
                if p == self.pattern_length:
                    # Full match ends at t; it started at t - p.
                    return t - p
            elif p != 0:
                # Mismatch mid-pattern: resume from the lps fallback, keeping t.
                p = self.lps[p - 1]
            else:
                # Mismatch at pattern start: just advance in the text.
                t += 1

        # Pattern never fully matched.
        return -1
|
370
|
+
|
371
|
+
|
372
|
+
|
373
|
+
|
374
|
+
# Boyer-Moore search algorithm class
|
375
|
+
# Boyer-Moore search algorithm class
class BoyerMooreSearch:
    """
    This class contains the Boyer-Moore search algorithm.
    """

    def __init__(self) -> None:
        """
        This function initializes the Boyer-Moore search algorithm class. [BM1977]_

        The Boyer-Moore search algorithm uses precomputed skip tables to jump
        over sections of the text that cannot contain a match, typically
        outperforming brute-force and KMP on long texts.

        .. [BM1977] Boyer, RS and Moore, JS. "A fast string searching algorithm." Communications of the ACM 20.10 (1977): 762-772.

        https://www.cs.jhu.edu/~langmea/resources/lecture_notes/strings_matching_boyer_moore.pdf
        """
        super().__init__()

    # This is what we call the "prefix - suffix" match case of the good suffix rule
    def aux_get_suffix_prefix_length(self,
        i: int,
    ) -> int:
        """
        This auxiliary function computes the length of the run matched when
        aligning the suffix pattern[i:] against a prefix of the pattern,
        scanning right-to-left.

        Arguments:
            i (int): The index of the suffix.

        Returns:
            int: The length of the matched run (pattern_length - i on a full
            suffix/prefix match, smaller on a partial one).
        """
        # Walk j leftwards from the end of the pattern while pattern[i:]
        # continues to match the prefix pattern[:pattern_length - i].
        j = self.pattern_length - 1
        while j >= i and self.pattern[j] == self.pattern[j - i]:
            j -= 1

        # j now sits on the first mismatch (or at i - 1 after a full match),
        # so the matched run has length pattern_length - (j + 1).
        # FIX: the original returned pattern_length - (j - 1), overcounting by
        # two; this was the error flagged by the original TODO comment.
        return self.pattern_length - (j + 1)

    # This is what we call the "substring match" case of the good suffix rule
    def aux_get_matching_substring_length(self,
        j: int,
    ) -> int:
        """
        This auxiliary function computes the length of the longest suffix of
        the pattern that matches a substring of the pattern ending at index j.

        It is used in the "substring match" case of the good suffix rule, i.e.
        when the suffix of the pattern does not match the text at all.

        Arguments:
            j (int): The end index of the substring.

        Returns:
            int: The length of the longest suffix of the pattern that matches a
            substring of the pattern ending at index j.
        """
        # Try progressively shorter substrings ending at j against the
        # pattern's suffix of the same length.
        for i in range(j, -1, -1):
            if self.pattern[i:i + (j + 1)] == self.pattern[self.pattern_length - (j + 1):]:
                return j - i + 1
        # No substring ending at j matches any suffix of the pattern.
        return 0

    # Creates the "good suffix" skip table
    def create_skip_gs(self) -> None:
        """
        This function creates the "good suffix" skip table (preprocessing step
        of the Boyer-Moore search algorithm).

        NOTE(review): ``search`` currently shifts using only the bad-character
        table; ``skip_gs`` is precomputed here but not consulted by the shift.

        Arguments:
            None

        Returns:
            None
        """
        # skip_gs[i] is the number of positions we may shift when a mismatch
        # occurs with the suffix starting at i already matched.
        self.skip_gs = [0] * self.pattern_length

        # First, the "prefix - suffix" case: the longest suffix of pattern[i:]
        # that matches a prefix of the pattern.
        for i in range(self.pattern_length - 1):
            self.skip_gs[i] = self.aux_get_suffix_prefix_length(i)

        # A mismatch on the very last character permits a minimal shift of 1.
        self.skip_gs[-1] = 1

        # Second, the "substring match" case: the longest suffix of the
        # pattern that matches a substring of the pattern ending at index j.
        for j in range(self.pattern_length - 2):
            k = (self.pattern_length - 1) - self.aux_get_matching_substring_length(j)
            if self.skip_gs[k] == 0:
                self.skip_gs[k] = self.pattern_length - 1 - j

    # Creates the "bad character" skip table
    def create_skip_bc(self) -> None:
        """
        This function creates the "bad character" skip table (preprocessing
        step of the Boyer-Moore search algorithm).

        Arguments:
            None

        Returns:
            None
        """
        # last_occurence[c] is the index of the last occurrence of character c
        # within pattern[:-1]; characters absent from the table are handled by
        # the -1 default in search().
        self.last_occurence = {}
        for j in range(self.pattern_length - 1):
            self.last_occurence[self.pattern[j]] = j

    # Searches for the pattern in the text using the Boyer-Moore algorithm
    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        This function searches for the pattern in the text using the
        Boyer-Moore algorithm (bad-character rule).

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found)

        Raises:
            AssertionError: If the text or the pattern is not a string.
        """
        # Check both the pattern and the text
        assert isinstance(pattern, str), 'The pattern must be a string.'
        assert isinstance(text, str), 'The text must be a string.'

        # Set the attributes
        self.pattern = pattern
        self.pattern_length = len(pattern)

        # FIX: an empty pattern used to raise IndexError in create_skip_gs();
        # by convention it matches at the start of any text.
        if self.pattern_length == 0:
            return 0

        # Preprocess the pattern: bad-character and good-suffix skip tables.
        self.create_skip_bc()
        self.create_skip_gs()

        # Slide the pattern over the text, comparing right-to-left.
        i = 0
        while i <= len(text) - self.pattern_length:
            j = self.pattern_length - 1
            while j >= 0 and text[i + j] == self.pattern[j]:
                j -= 1
            # All characters matched.
            if j < 0:
                return i
            # Bad-character rule: align the mismatched text character with its
            # last occurrence in the pattern; -1 (absent) yields a full j + 1
            # shift. FIX: the original used pattern_length as the absent
            # default, which made the shift term negative and collapsed every
            # absent-character shift to 1.
            i += max(j - self.last_occurence.get(text[i + j], -1), 1)

        # Return -1 if the pattern is not found
        return -1
|
554
|
+
|
555
|
+
def demo():
    """Run each search algorithm on a small Korean sample and print the hits."""
    demo_sentence = '제가 나와 있는 곳은 경남 거제시 옥포동 덕포 해수욕장에 나와 있습니다.'
    demo_pattern = '옥포동'

    # Exercise every algorithm on the identical pattern/text pair.
    naive_idx = NaiveSearch().search(demo_pattern, demo_sentence)
    rk_idx = RabinKarpSearch().search(demo_pattern, demo_sentence)
    kmp_idx = KMPSearch().search(demo_pattern, demo_sentence)
    bm_idx = BoyerMooreSearch().search(demo_pattern, demo_sentence)

    print('Naive search: {}'.format(naive_idx))
    print('Rabin-Karp search: {}'.format(rk_idx))
    print('KMP search: {}'.format(kmp_idx))
    print('Boyer-Moore search: {}'.format(bm_idx))


if __name__ == '__main__':
    demo()
|