nltkor 1.2.14__cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,569 @@
1
+ """
2
+ string2string search
3
+ src = https://github.com/stanfordnlp/string2string
4
+
5
+
6
+ MIT License
7
+
8
+ Copyright (c) 2023 Mirac Suzgun
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+
29
+ """
30
+
31
+
32
+ """
33
+ This module contains the following algorithms:
34
+ (-) Naive search algorithm ++
35
+ (a) Rabin-Karp algorithm ++
36
+ (b) Boyer-Moore algorithm ++
37
+ (c) Knuth-Morris-Pratt algorithm
38
+ (d) Suffix Tree algorithm
39
+ (e) Suffix Array algorithm
40
+ (f) Suffix Automaton algorithm
41
+ (g) Aho-Corasick algorithm (basis of fgrep/grep in Unix) ++ (not implemented)
42
+ (h) Ukkonen's algorithm -- (not implemented)
43
+ (i) Wu-Manber algorithm ++ (not implemented)
44
+ (j) Z-Algorithm ++ (not implemented)
45
+ """
46
+
47
+ from typing import List, Union, Tuple, Optional
48
+
49
+ # for dev purposes
50
+ import sys
51
+ # sys.path.append("/Users/dowon/nltk_ko/nltk/misc")
52
+ from nltkor.misc.string2string_hash_functions import HashFunction, PolynomialRollingHash
53
+ # from string2string_hash_functions import HashFunction, PolynomialRollingHash
54
+
55
+
56
+ # Parent class for all search algorithms
57
class SearchAlgorithm:
    """
    Common parent of the exact string-search classes in this module.

    It only fixes the shape of the ``search`` interface; neither method
    does any work here.
    """

    def __init__(self) -> None:
        """
        Create the base object. No state is set up at this level.

        Returns:
            None
        """
        return None

    def search(self,
               pattern: str,
               text: str,
               ) -> int:
        """
        Interface placeholder for locating ``pattern`` inside ``text``.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: Subclasses return the index of the pattern in the text.
            This base implementation performs no search and returns None.
        """
        return None
86
+
87
+
88
class NaiveSearch(SearchAlgorithm):
    """
    Brute-force search: every alignment of the pattern against the text is
    compared in turn, from left to right.
    """

    def __init__(self) -> None:
        """
        Set up the searcher by delegating to the parent initializer.

        Returns:
            None
        """
        super().__init__()

    def search(self,
               pattern: str,
               text: str,
               ) -> int:
        """
        Find the first position at which ``pattern`` occurs in ``text``.

        The pattern and its length are stored on the instance as
        ``self.pattern`` and ``self.pattern_length``.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found).

        Raises:
            AssertionError: If the inputs are invalid.
        """
        # Validate the arguments
        assert isinstance(pattern, str), 'The pattern must be a string.'
        assert isinstance(text, str), 'The text must be a string.'

        # Remember the pattern on the instance
        self.pattern = pattern
        self.pattern_length = len(pattern)

        # Every start index at which a full-width window still fits in the text
        width = self.pattern_length
        candidate_starts = range(len(text) - width + 1)

        # First window equal to the pattern wins; -1 when there is none
        return next(
            (start for start in candidate_starts
             if text[start:start + width] == pattern),
            -1,
        )
136
+
137
+
138
+
139
+ # Rabin-Karp search algorithm class
140
class RabinKarpSearch(SearchAlgorithm):
    """
    This class contains the Rabin-Karp search algorithm.
    """

    def __init__(self,
                 hash_function: Optional[HashFunction] = None,
                 ) -> None:
        """
        This function initializes the Rabin-Karp search algorithm class, which uses a hash function to search for a pattern in a text. [RK1987]_

        Arguments:
            hash_function (HashFunction, optional): The hash function to use.
                When omitted (or None), a new ``PolynomialRollingHash()`` is
                created for this instance.

        Returns:
            None

        Raises:
            AssertionError: If the inputs are invalid.

        .. [RK1987] Karp, R.M. and Rabin, M.O., 1987. Efficient randomized pattern-matching algorithms. IBM Journal of Research and Development, 31(2), pp.249-260.
        """
        super().__init__()

        # Build the default per instance. A default written in the signature is
        # evaluated once, so every searcher would share (and reset/update) the
        # very same hash object.
        if hash_function is None:
            hash_function = PolynomialRollingHash()

        assert isinstance(hash_function, HashFunction), 'The hash function must be a HashFunction object.'

        # Set the attributes
        self.hash_function = hash_function

    def initialize_pattern_hash(self,
                                pattern: str,
                                ) -> None:
        """
        This function stores the pattern and computes its hash value.

        Sets ``self.pattern``, ``self.pattern_hash`` and ``self.pattern_length``.

        Arguments:
            pattern (str): The pattern to search for.

        Returns:
            None

        Raises:
            AssertionError: If the inputs are invalid.
        """
        # Check the inputs
        assert isinstance(pattern, str), 'The pattern must be a string.'

        # Reset the hash function so the pattern hash starts from a clean state
        self.hash_function.reset()

        # Set the attributes
        self.pattern = pattern

        # Compute the hash value of the pattern
        self.pattern_hash = self.hash_function.compute(self.pattern)

        # Length of the pattern
        self.pattern_length = len(self.pattern)

    def itialize_pattern_hash(self,
                              pattern: str,
                              ) -> None:
        """
        Backward-compatible alias (original, misspelled name) of
        ``initialize_pattern_hash``.

        Arguments:
            pattern (str): The pattern to search for.

        Returns:
            None

        Raises:
            AssertionError: If the inputs are invalid.
        """
        self.initialize_pattern_hash(pattern)

    def search(self,
               pattern: str,
               text: str,
               ) -> int:
        """
        This function searches for the pattern in the text.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found).
            An empty pattern is reported at index 0.

        Raises:
            AssertionError: If the inputs are invalid.
        """
        # Check the inputs
        assert isinstance(text, str), 'The text must be a string.'

        # Initialize the pattern hash. The legacy name is called on purpose so
        # that subclasses overriding either spelling keep working.
        self.itialize_pattern_hash(pattern)

        # An empty pattern matches at the very beginning. Without this guard the
        # verification step below would index into an empty string.
        if self.pattern_length == 0:
            return 0

        # No window of the required width fits into the text
        last_start = len(text) - self.pattern_length
        if last_start < 0:
            return -1

        # Reset the hash function (it was just used for the pattern) [Important!]
        self.hash_function.reset()

        # Compute the hash value of the first window
        window_hash = self.hash_function.compute(text[:self.pattern_length])

        # Slide the window over the text
        for i in range(last_start + 1):
            # Equal hashes only make a match possible; confirm on the actual
            # characters to rule out a collision.
            if window_hash == self.pattern_hash and text[i:i + self.pattern_length] == self.pattern:
                return i

            # Roll the hash forward: drop text[i], take in the next character
            if i < last_start:
                window_hash = self.hash_function.update(text[i], text[i + self.pattern_length], self.pattern_length)

        # Return -1 if the pattern is not found
        return -1
257
+
258
+
259
+
260
+ # Knuth-Morris-Pratt (KMP) search algorithm class
261
class KMPSearch(SearchAlgorithm):
    """
    This class contains the KMP search algorithm.
    """

    def __init__(self) -> None:
        r"""
        This function initializes the Knuth-Morris-Pratt (KMP) search algorithm class. [KMP1977]_

        Arguments:
            None

        Returns:
            None

        .. note::
            * The current version of the KMP algorithm utilizes an auxiliary list called the lps_array, which stands for "longest proper prefix which is also a suffix". The lps_array is a list of integers where lps_array[i] represents the length of the longest proper prefix of the pattern that is also a suffix of the pattern[:i+1].
            * By precomputing the lps_array, the KMP algorithm avoids unnecessary character comparisons while searching for the pattern in the text. The algorithm scans the text from left to right and compares characters in the pattern with characters in the text. When a mismatch occurs, the algorithm uses the values in the lps_array to determine the next character in the pattern to compare with the text.
            * An alternative implementation of the KMP algorithm exists, which uses a finite state automaton (FSA) instead of the lps_array, but this is not implemented in this version of the package.

        .. [KMP1977] Knuth, D.E., Morris, J.H. and Pratt, V.R., 1977. Fast pattern matching in strings. SIAM journal on computing, 6(2), pp.323-350.
        """
        super().__init__()

    def initialize_lps(self) -> None:
        r"""
        This function initializes the longest proper prefix suffix (lps) array, stored as ``self.lps``.

        For each index i, lps[i] is the length of the longest proper prefix of pattern[:i + 1] that is also a suffix of pattern[:i + 1]. In other words, if k = lps[i], then pattern[:k] is equal to pattern[i - k + 1:i + 1], and no longer proper prefix has this property. The lps array is used by ``search`` to avoid re-comparing characters after a mismatch.

        The method reads ``self.pattern`` and ``self.pattern_length``, which ``search`` sets before calling it.

        Arguments:
            None

        Returns:
            None
        """
        # lps[0] is always 0: a single character has no non-empty proper prefix
        self.lps = [0] * self.pattern_length

        i = 1  # index of the pattern character currently being processed
        j = 0  # length of the prefix/suffix match carried over from pattern[:i]
        while i < self.pattern_length:
            if self.pattern[i] == self.pattern[j]:
                # The current match can be extended by one character
                j += 1
                self.lps[i] = j
                i += 1
            elif j != 0:
                # Fall back to the next shorter candidate; i is not advanced
                j = self.lps[j - 1]
            else:
                # No prefix matches here at all
                self.lps[i] = 0
                i += 1

    def search(self,
               pattern: str,
               text: str,
               ) -> int:
        """
        This function searches for the pattern in the text.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found).
            An empty pattern is reported at index 0.

        Raises:
            AssertionError: If the text is not a string.

        .. note::
            * This is the main function of the KMP search algorithm class.
        """
        # Check the inputs
        assert isinstance(text, str), 'The text must be a string.'

        # Set the attributes
        self.pattern = pattern
        self.pattern_length = len(self.pattern)

        # Initialize the lps array
        self.initialize_lps()

        # An empty pattern matches at the very beginning. Without this guard the
        # loop below would index into an empty pattern.
        if self.pattern_length == 0:
            return 0

        text_length = len(text)
        i = 0  # position in the text
        j = 0  # position in the pattern (= number of characters matched so far)
        while i < text_length:
            if self.pattern[j] == text[i]:
                i += 1
                j += 1
                # Check if the whole pattern has been matched
                if j == self.pattern_length:
                    return i - j
            elif j != 0:
                # Mismatch after a partial match: reuse the longest border of
                # the matched part instead of restarting; i stays where it is
                j = self.lps[j - 1]
            else:
                # Mismatch on the first pattern character: move on in the text
                i += 1

        # Return -1 if the pattern is not found
        return -1
370
+
371
+
372
+
373
+
374
+ # Boyer-Moore search algorithm class
375
class BoyerMooreSearch:
    """
    This class contains the Boyer-Moore search algorithm.
    """

    def __init__(self) -> None:
        """
        This function initializes the Boyer-Moore search algorithm class. [BM1977]_

        The Boyer-Moore search algorithm is a string searching algorithm that uses a heuristic to skip over large sections of the search string, resulting in faster search times than traditional algorithms such as brute-force or Knuth-Morris-Pratt. It is particularly useful for searching for patterns in large amounts of text.

        .. [BM1977] Boyer, RS and Moore, JS. "A fast string searching algorithm." Communications of the ACM 20.10 (1977): 762-772.

        A Correct Preprocessing Algorithm for Boyer–Moore String-Searching

        https://www.cs.jhu.edu/~langmea/resources/lecture_notes/strings_matching_boyer_moore.pdf

        """
        super().__init__()

    # This is what we call the "prefix - suffix" match case of the good suffix rule
    def aux_get_suffix_prefix_length(self,
                                     i: int,
                                     ) -> int:
        """
        Auxiliary function for the good-suffix table, intended to measure how much of the end of the pattern matches the pattern shifted by ``i`` positions.

        NOTE(review): this belongs to the good-suffix preprocessing that is flagged as erroneous in ``create_skip_gs``. The value returned is ``pattern_length - (j - 1)``; for ``i = 0`` that is ``pattern_length + 2``. The logic is kept as it was.

        Arguments:
            i (int): The index of the suffix.

        Returns:
            int: ``pattern_length - (j - 1)``, where j is the index at which the right-to-left comparison stopped.
        """
        # pattern [ ....... i ................j]
        # Initialize j to the end of the pattern
        j = self.pattern_length - 1

        # pattern [ ....... i ....... j .......]
        # Move j to the left until we find a mismatch or until j < i
        while j >= i and self.pattern[j] == self.pattern[j - i]:
            # pattern [ ... j-i ..... i ... j .......]
            j -= 1

        return self.pattern_length - (j - 1)

    # This is what we call the "substring match" case of the good suffix rule
    def aux_get_matching_substring_length(self,
                                          j: int,
                                          ) -> int:
        """
        Auxiliary function for the good-suffix table, intended to find a substring of the pattern that equals the pattern's suffix of length ``j + 1``.

        Start indices ``i = j, j - 1, ..., 0`` are tried in that order; for the first one where ``pattern[i:i + j + 1]`` equals the last ``j + 1`` characters of the pattern, ``j - i + 1`` is returned.

        NOTE(review): this belongs to the good-suffix preprocessing that is flagged as erroneous in ``create_skip_gs``. The logic is kept as it was.

        Arguments:
            j (int): The end index of the substring.

        Returns:
            int: ``j - i + 1`` for the first matching start index i, or 0 if no start index matches.
        """
        # The suffix does not depend on the loop variable
        suffix = self.pattern[self.pattern_length - (j + 1):]

        for i in range(j, -1, -1):
            # Check if the substring matches the suffix
            if self.pattern[i:i + (j + 1)] == suffix:
                return j - i + 1

        # If we get here, no candidate substring equals the suffix
        return 0

    # Creates the "good suffix" skip table
    def create_skip_gs(self) -> None:
        """
        This function creates the "good suffix" skip table, stored as ``self.skip_gs``.

        NOTE(review): the table is known to be incorrect (see the TODO below) and ``search`` does not read it; it is still built so that the attribute keeps existing after a search.

        Arguments:
            None

        Returns:
            None
        """
        # Create the good suffix "skip" table
        # TODO(msuzgun): Has an error!
        self.skip_gs = [0] * self.pattern_length

        # Nothing to fill in for an empty pattern (the assignment to the last
        # cell below would raise IndexError on an empty list)
        if self.pattern_length == 0:
            return

        # First, fill every cell but the last from the prefix/suffix helper
        for i in range(self.pattern_length - 1):
            self.skip_gs[i] = self.aux_get_suffix_prefix_length(i)

        # The last cell always gets a skip of 1
        self.skip_gs[-1] = 1

        # Second, use the substring helper to fill cells that are still 0
        for j in range(self.pattern_length - 2):
            k = (self.pattern_length - 1) - self.aux_get_matching_substring_length(j)
            if self.skip_gs[k] == 0:
                self.skip_gs[k] = self.pattern_length - 1 - j

    # Creates the "bad character" skip table
    def create_skip_bc(self) -> None:
        """
        This function creates the "bad character" skip table, stored as ``self.last_occurence``.

        ``last_occurence[c]`` is the index of the last occurrence of the character c among all pattern characters except the final one.

        Arguments:
            None

        Returns:
            None
        """
        # Later indices overwrite earlier ones, so each character ends up
        # mapped to its right-most position (final pattern character excluded)
        self.last_occurence = {
            char: index for index, char in enumerate(self.pattern[:-1])
        }

        # Legacy sentinel entry, kept so the table has the same content as
        # before; search() does not consult it
        self.last_occurence.setdefault(None, self.pattern_length)

    # Searches for the pattern in the text using the Boyer-Moore algorithm
    def search(self,
               pattern: str,
               text: str,
               ) -> int:
        """
        This function searches for the pattern in the text using the bad-character rule of the Boyer-Moore algorithm.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found).
            An empty pattern is reported at index 0.

        Raises:
            AssertionError: If the text or the pattern is not a string.
        """
        # Check both the pattern and the text
        assert isinstance(pattern, str), 'The pattern must be a string.'
        assert isinstance(text, str), 'The text must be a string.'

        # Set the attributes
        self.pattern = pattern
        self.pattern_length = len(self.pattern)

        # Preprocess the pattern by creating the skip tables for the bad character and good suffix rules, respectively.
        self.create_skip_bc()
        self.create_skip_gs()

        # An empty pattern matches at the very beginning
        if self.pattern_length == 0:
            return 0

        last_start = len(text) - self.pattern_length

        # Loop over the alignments of the pattern against the text
        i = 0
        while i <= last_start:
            # Compare the pattern right to left
            j = self.pattern_length - 1
            while j >= 0 and text[i + j] == self.pattern[j]:
                j -= 1

            # Check if the pattern is found
            if j < 0:
                return i

            # Bad-character shift: line up the right-most table occurrence of
            # the mismatching text character with it. A character missing from
            # the table counts as position -1, so the pattern jumps completely
            # past it (shift j + 1). The shift is never smaller than 1.
            i += max(j - self.last_occurence.get(text[i + j], -1), 1)

        # Return -1 if the pattern is not found
        return -1
554
+
555
def demo():
    """
    Run each search class of this module on one Korean sample sentence and
    print the index every one of them reports for the same pattern.
    """
    demo_sentence = '제가 나와 있는 곳은 경남 거제시 옥포동 덕포 해수욕장에 나와 있습니다.'
    demo_pattern = '옥포동'

    # Output template paired with the class that produces the value for it
    runs = (
        ('Naive search: {}', NaiveSearch),
        ('Rabin-Karp search: {}', RabinKarpSearch),
        ('KMP search: {}', KMPSearch),
        ('Boyer-Moore search: {}', BoyerMooreSearch),
    )

    # Finish all searches first, then print, so nothing is printed if any
    # of the searches fails
    found = [
        (template, searcher().search(demo_pattern, demo_sentence))
        for template, searcher in runs
    ]
    for template, index in found:
        print(template.format(index))
566
+
567
+
568
# Run the demonstration only when this file is executed as a script
if __name__ == '__main__':
    demo()