nltkor 1.2.14__cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1315 @@
|
|
1
|
+
"""
|
2
|
+
string2string alignment
|
3
|
+
src = https://github.com/stanfordnlp/string2string
|
4
|
+
|
5
|
+
|
6
|
+
MIT License
|
7
|
+
|
8
|
+
Copyright (c) 2023 Mirac Suzgun
|
9
|
+
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
12
|
+
in the Software without restriction, including without limitation the rights
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
15
|
+
furnished to do so, subject to the following conditions:
|
16
|
+
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
18
|
+
copies or substantial portions of the Software.
|
19
|
+
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
26
|
+
SOFTWARE.
|
27
|
+
|
28
|
+
|
29
|
+
"""
|
30
|
+
|
31
|
+
import multiprocessing
|
32
|
+
|
33
|
+
from typing import List, Union, Tuple, Optional
|
34
|
+
import numpy as np
|
35
|
+
from nltkor.make_requirement import make_requirement
|
36
|
+
|
37
|
+
try:
|
38
|
+
from joblib import Parallel, delayed
|
39
|
+
from tqdm import tqdm
|
40
|
+
except ImportError:
|
41
|
+
requirement = ['joblib', 'tqdm>=4.40.0']
|
42
|
+
file_path = make_requirement(requirement)
|
43
|
+
raise Exception(f"""
|
44
|
+
Need to install Libraries, please pip install below libraries
|
45
|
+
\t pip install joblib
|
46
|
+
\t pip install tqdm>=4.40.0
|
47
|
+
Or, use pip install requirement.txt
|
48
|
+
\t pip install -r {file_path}
|
49
|
+
""")
|
50
|
+
|
51
|
+
|
52
|
+
# for dev purposes
|
53
|
+
import sys
|
54
|
+
# sys.path.append("/Users/dowon/nltk_ko/nltk/misc")
|
55
|
+
from nltkor.misc.string2string_basic_functions import cartesian_product
|
56
|
+
# from string2string_basic_functions import cartesian_product
|
57
|
+
|
58
|
+
|
59
|
+
# Parent class for all alignment algorithms
|
60
|
+
class StringAlignment:
    """
    This class is the parent class for all alignment algorithms implemented in this module.

    It stores the scoring parameters (match, mismatch and gap weights, the gap
    character and an optional per-pair match dictionary) and offers the scoring
    and formatting helpers that the alignment subclasses share.
    """
    # Initialize the class.
    def __init__(self,
        match_weight: float = 1.,
        mismatch_weight: float = -1.,
        gap_weight: float = -1,
        gap_char: str = "-",
        match_dict: Optional[dict] = None,
    ) -> None:
        r"""
        This function initializes the StringAlignment class.

        Arguments:
            match_weight (int or float): The weight for a match (default: 1).
            mismatch_weight (int or float): The weight for a mismatch (default: -1).
            gap_weight (int or float): The weight for a gap (default: -1).
            gap_char (str): The character for a gap (default: "-").
            match_dict (dict): The match dictionary (default: None).

        Returns:
            None

        .. note::
            * The match_dict is a nested dictionary of pair weights. For example, with
              {"A": {"A": 1, "T": -1}, "T": {"A": -1, "T": 1}} the weight of ("A", "A") is 1,
              of ("A", "T") is -1, of ("T", "A") is -1 and of ("T", "T") is 1.
            * The match_dict is useful for aligning non-identical characters, e.g.
              {"A": {"T": 1}} gives the pair ("A", "T") a weight of 1.
            * Pairs that are absent from the match_dict fall back to match_weight when the two
              characters are equal and to mismatch_weight otherwise.
            * An entry match_dict[c][gap_char], when present, overrides gap_weight for c.
        """
        # Set the weights.
        self.match_weight = match_weight
        self.mismatch_weight = mismatch_weight
        self.gap_weight = gap_weight
        self.gap_char = gap_char
        self.match_dict = match_dict


    def bool_match(self,
        c1: Union[str, List[str]],
        c2: Union[str, List[str]],
    ) -> bool:
        """
        The function returns whether two characters match, according to the match dictionary (if it exists).

        Arguments:
            c1 (str or list of str): The first character or string.
            c2 (str or list of str): The second character or string.

        Returns:
            True when the pair is listed in the match dictionary with a non-negative weight;
            for pairs that are not listed (or when there is no dictionary), whether c1 == c2.
        """
        if (
            self.match_dict is not None
            and c1 in self.match_dict
            and c2 in self.match_dict[c1]
        ):
            # A listed pair counts as a match when its weight is non-negative.
            return self.match_dict[c1][c2] >= 0
        # No dictionary, or the pair is not listed: plain equality.
        return c1 == c2


    # Get the match weight.
    def get_match_weight(self,
        c1: Union[str, List[str]],
        c2: Union[str, List[str]],
    ) -> float:
        """
        This function returns the match weight of two characters.

        Arguments:
            c1 (str or list of str): The first character or string.
            c2 (str or list of str): The second character or string.

        Returns:
            The weight listed in the match dictionary for (c1, c2) when there is one;
            otherwise match_weight if c1 == c2 and mismatch_weight if not.
        """
        if (
            self.match_dict is not None
            and c1 in self.match_dict
            and c2 in self.match_dict[c1]
        ):
            return self.match_dict[c1][c2]
        # Fallback shared by the "no dictionary" and "pair not listed" cases.
        if c1 == c2:
            return self.match_weight
        return self.mismatch_weight


    # Get the gap weight.
    def get_gap_weight(self,
        c: Union[str, List[str]],
    ) -> float:
        """
        This function returns the gap weight of a character or string.

        Arguments:
            c (str or list of str): The character or string.

        Returns:
            match_dict[c][gap_char] when that entry exists, otherwise gap_weight.
        """
        if (
            self.match_dict is not None
            and c in self.match_dict
            and self.gap_char in self.match_dict[c]
        ):
            return self.match_dict[c][self.gap_char]
        return self.gap_weight


    # Get the score of a character pair.
    def get_score(self,
        c1: Union[str, List[str]],
        c2: Union[str, List[str]],
    ) -> float:
        """
        This function returns the score of a character or string pair.

        Unlike get_match_weight() and get_gap_weight(), this helper does not consult the
        match dictionary; it only uses match_weight, gap_weight and mismatch_weight.

        Arguments:
            c1 (str or list of str): The first character or string.
            c2 (str or list of str): The second character or string.

        Returns:
            The score of the character or string pair.
        """
        # Equality is tested first, so two gap characters score as a match.
        if c1 == c2:
            return self.match_weight
        # If one of the characters is a gap, return the gap weight.
        if c1 == self.gap_char or c2 == self.gap_char:
            return self.gap_weight
        # Otherwise, return the mismatch weight.
        return self.mismatch_weight


    # Get the alignment score of two strings (or list of strings).
    # (This is the sum of the scores of the characters.)
    def get_alignment_score(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        """
        This function returns the alignment score of two strings (or list of strings).

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The sum of get_score() over the position-wise pairs. Pairing uses zip(), so
            positions beyond the shorter input are ignored.
        """
        return sum((self.get_score(c1, c2) for c1, c2 in zip(str1, str2)), 0.)


    # Pad the shorter string.
    def add_space_to_shorter(self,
        str1: str,
        str2: str,
    ) -> Tuple[str, str]:
        """
        This function right-pads the shorter string with spaces so that the two strings have the same length.

        Arguments:
            str1 (str): The first string.
            str2 (str): The second string.

        Returns:
            The two strings with the same length.
        """
        # Get the maximum length of the two strings.
        max_len = max(len(str1), len(str2))

        # At most one of the two strings can be shorter than max_len.
        if len(str1) < max_len:
            str1 = str1 + ' ' * (max_len - len(str1))
        elif len(str2) < max_len:
            str2 = str2 + ' ' * (max_len - len(str2))

        # Return the padded strings.
        return str1, str2


    # Print the alignment.
    def print_alignment(self,
        str1: str,
        str2: str,
    ) -> None:
        """
        This function prints the alignment of two strings, one per line.

        Arguments:
            str1 (str): The first string.
            str2 (str): The second string.

        Returns:
            None.
        """
        print(str1)
        print(str2)


    def get_alignment_strings_and_indices(self,
        str1: str,
        str2: str,
        separator: str = ' | ',
    ) -> Tuple[List[Tuple[int, int]], List[str], List[str]]:
        """
        This function returns the indices of the aligned characters, and the two strings split on the separator.

        Arguments:
            str1 (str): The first string, separated by the separator.
            str2 (str): The second string, separated by the separator.
            separator (str): The separator.

        Returns:
            A tuple (alignment_indices, sym1, sym2), where alignment_indices is a list of
            (i, i) pairs for every position i at which bool_match() holds, and sym1 / sym2
            are the fields of str1 / str2.
        """
        sym1 = str1.split(separator)
        sym2 = str2.split(separator)

        # Walk both field lists in lockstep. zip() stops at the shorter list, so inputs
        # with a different number of fields no longer raise an IndexError.
        alignment_indices = [
            (i, i)
            for i, (s1, s2) in enumerate(zip(sym1, sym2))
            if self.bool_match(s1, s2)
        ]

        return alignment_indices, sym1, sym2


    # Compute multiple pairs in parallel using multiprocessing.
    def compute_multiple_pairs(self,
        pairs: List[Tuple[Union[str, List[str]], Union[str, List[str]]]],
        num_workers: int = 1,
        method: str = "multiprocessing",
        **kwargs,
    ) -> List[Tuple[float, Union[List[str], List[List[str]]]]]:
        """
        This "meta" function runs self.compute on multiple pairs of strings (or lists of strings) in parallel.

        Arguments:
            pairs (list of tuples): A list of tuples, where each tuple contains two strings (or lists of strings).
            num_workers (int): The number of workers to use for multiprocessing.
            method (str): The method to use for parallelization. Options are "multiprocessing" and "joblib".
            **kwargs: Additional keyword arguments to pass to the compute function.

        Returns:
            A list with one result of self.compute per pair, in the order of pairs.

        Raises:
            ValueError: If method is neither "multiprocessing" nor "joblib".

        .. note::
            * self.compute is not defined on this base class; presumably the concrete subclass provides it (TODO confirm).
            * The multiprocessing module is used by default, but joblib can be used instead by setting method="joblib".
            * With method="multiprocessing" the instance and the keyword arguments are sent to worker processes, so they must be picklable.
        """
        if method == "multiprocessing":
            # Local import keeps this block self-contained.
            from functools import partial

            # Bind kwargs as keywords. Appending the dict to the argument tuple would hand
            # it to compute() as a third positional argument instead.
            worker = partial(self.compute, **kwargs)
            with multiprocessing.Pool(num_workers) as pool:
                results = pool.starmap(
                    worker,
                    [(pair[0], pair[1]) for pair in pairs],
                )
        elif method == "joblib":
            results = Parallel(n_jobs=num_workers)(
                delayed(self.compute)(pair[0], pair[1], **kwargs) for pair in tqdm(pairs)
            )
        else:
            raise ValueError(f"Invalid method: {method}")
        return results
|
353
|
+
|
354
|
+
|
355
|
+
|
356
|
+
# Needleman-Wunsch algorithm class
|
357
|
+
class NeedlemanWunsch(StringAlignment):
    """
    Global alignment of two sequences (strings or lists of strings) with the
    Needleman-Wunsch dynamic-programming algorithm.
    """
    # Initialize the class.
    def __init__(self,
        match_weight: float = 1.,
        mismatch_weight: float = -1.,
        gap_weight: float = -1.,
        gap_char: str = "-",
        match_dict: Optional[dict] = None,
    ) -> None:
        r"""
        This function initializes the Needleman-Wunsch algorithm, which is used to get the global alignment of sequences (e.g., strings or lists of strings) such as DNA sequences.

        The algorithm is described in the following paper: [Needleman1970]_

        Arguments:
            match_weight (float): The weight of a match (default: 1.).
            mismatch_weight (float): The weight of a mismatch (default: -1.).
            gap_weight (float): The weight of a gap (default: -1.).
            gap_char (str): The gap character (default: "-").
            match_dict (dict): The match dictionary (default: None).

        .. [Needleman1970] Needleman, S.B. and Wunsch, C.D., 1970. A General Method Applicable to the Search for Similarities in the Amino Acid Sequence of Two Proteins. Journal of Molecular Biology, 48(3), pp.443-453.
        """
        # Initialize using the parent class.
        super().__init__(
            match_weight=match_weight,
            mismatch_weight=mismatch_weight,
            gap_weight=gap_weight,
            match_dict=match_dict,
            gap_char=gap_char,
        )


    # The auxiliary backtrack function.
    def backtrack(self,
        score_matrix: np.ndarray,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> Tuple[Union[str, List[str]], Union[str, List[str]]]:
        r"""
        This function is an auxiliary function, used by the get_alignment() function, that backtracks the score matrix to get the aligned strings.

        Arguments:
            score_matrix (np.ndarray): The score matrix, indexed as score_matrix[i, j] for the first i elements of str1 and the first j elements of str2.
            str1: The first string (or list of strings).
            str2: The second string (or list of strings).

        Returns:
            The two aligned strings. Aligned elements are joined with " | ", and within each column the shorter element is right-padded with spaces.

        .. note::
            * There might be multiple optimal alignments. This function returns one of them; ties are resolved in the order diagonal, left, up.
            * The diagonal step is checked with get_match_weight(), the same scoring function that get_alignment() uses to fill the matrix, so the two stay consistent when a match dictionary is used.
            * The number of backtracking steps is at most :math:`m + n`, where :math:`n` and :math:`m` are the lengths of str1 and str2, respectively.
        """

        # Start from the bottom-right cell of the matrix.
        i = len(str1)
        j = len(str2)

        # Columns are collected back-to-front and reversed once at the end, which avoids
        # repeatedly prepending to an ever-growing string.
        columns1 = []
        columns2 = []

        # Backtrack until the first row or the first column is reached.
        while i > 0 and j > 0:
            current = score_matrix[i, j]
            if current == score_matrix[i - 1, j - 1] + self.get_match_weight(str1[i - 1], str2[j - 1]):
                # Match/mismatch: consume one element of each sequence (diagonal move).
                insert_str1, insert_str2 = self.add_space_to_shorter(str1[i - 1], str2[j - 1])
                i -= 1
                j -= 1
            elif current == score_matrix[i, j - 1] + self.get_gap_weight(str2[j - 1]):
                # Gap in str1: consume one element of str2 (move left).
                insert_str1, insert_str2 = self.add_space_to_shorter(self.gap_char, str2[j - 1])
                j -= 1
            else:
                # Gap in str2: consume one element of str1 (move up). Taking this as the
                # fallback guarantees progress on every iteration, even for a matrix that
                # matches none of the three recurrences exactly.
                insert_str1, insert_str2 = self.add_space_to_shorter(str1[i - 1], self.gap_char)
                i -= 1

            columns1.append(insert_str1)
            columns2.append(insert_str2)

        # If there are still elements in str1, align them against gaps.
        while i > 0:
            insert_str1, insert_str2 = self.add_space_to_shorter(str1[i - 1], self.gap_char)
            columns1.append(insert_str1)
            columns2.append(insert_str2)
            i -= 1

        # If there are still elements in str2, align them against gaps.
        while j > 0:
            insert_str1, insert_str2 = self.add_space_to_shorter(self.gap_char, str2[j - 1])
            columns1.append(insert_str1)
            columns2.append(insert_str2)
            j -= 1

        # Restore left-to-right order and join the columns with the separator.
        aligned_str1 = ' | '.join(reversed(columns1))
        aligned_str2 = ' | '.join(reversed(columns2))

        # Return the aligned strings.
        return aligned_str1, aligned_str2


    # Get the alignment of two strings (or list of strings).
    def get_alignment(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        return_score_matrix: bool = False,
    ) -> Tuple[Union[str, List[str]], Union[str, List[str]], Optional[np.ndarray]]:
        r"""
        This is the main function in the NeedlemanWunsch class that gets the alignment of two strings (or list of strings) by using the Needleman-Wunsch algorithm.

        Arguments:
            str1: The first string (or list of strings).
            str2: The second string (or list of strings).
            return_score_matrix (bool): Whether to return the score matrix. (Default: False)

        Returns:
            A tuple (aligned_str1, aligned_str2). If return_score_matrix is True, the score matrix is appended as a third element.

        .. note::
            * There might be multiple optimal alignments. This function returns one of the optimal alignments.
            * The time complexity of this function is :math:`O(nm)`, where :math:`n` and :math:`m` are the lengths of the strings str1 and str2, respectively.
            * The space complexity of this function is :math:`O(nm)`.
            * The " | " character is used to separate the elements in the aligned strings.
        """

        # Lengths of strings str1 and str2, respectively.
        len1 = len(str1)
        len2 = len(str2)

        # Initialize the score matrix.
        score_matrix = np.zeros((len1 + 1, len2 + 1))

        # First column / first row: a prefix aligned entirely against gaps.
        for i in range(1, len1 + 1):
            score_matrix[i, 0] = score_matrix[i - 1, 0] + self.get_gap_weight(str1[i - 1])
        for j in range(1, len2 + 1):
            score_matrix[0, j] = score_matrix[0, j - 1] + self.get_gap_weight(str2[j - 1])

        # Dynamic programming solution (Needleman-Wunsch algorithm):
        for i in range(1, len1 + 1):
            for j in range(1, len2 + 1):
                # Get the scores of the three possible paths.
                match_score = score_matrix[i - 1, j - 1] + self.get_match_weight(str1[i - 1], str2[j - 1])
                delete_score = score_matrix[i - 1, j] + self.get_gap_weight(str1[i - 1])
                insert_score = score_matrix[i, j - 1] + self.get_gap_weight(str2[j - 1])

                # Keep the best of the three.
                score_matrix[i, j] = max(match_score, delete_score, insert_score)

        # Get the alignment.
        aligned_str1, aligned_str2 = self.backtrack(score_matrix, str1, str2)

        # Return the alignment and, on request, the score matrix.
        if return_score_matrix:
            return aligned_str1, aligned_str2, score_matrix
        return aligned_str1, aligned_str2
|
527
|
+
|
528
|
+
|
529
|
+
|
530
|
+
# Hirschberg algorithm (linear space algorithm).
|
531
|
+
class Hirschberg(NeedlemanWunsch):
|
532
|
+
def __init__(self,
|
533
|
+
match_weight: Union[int, float] = 2,
|
534
|
+
mismatch_weight: Union[int, float] = -1,
|
535
|
+
gap_weight: Union[int, float] = -2,
|
536
|
+
gap_char: str = '-',
|
537
|
+
match_dict: dict = None,
|
538
|
+
) -> None:
|
539
|
+
r"""
|
540
|
+
This function initializes the parameters of the Hirschberg algorithm [H1975]_, a space-efficient solution to the global alignment problem. It inherits from the NeedlemanWunsch class.
|
541
|
+
|
542
|
+
Arguments:
|
543
|
+
match_weight (int or float): The weight of a match (default: 2).
|
544
|
+
mismatch_weight (int or float): The weight of a mismatch (default: -1).
|
545
|
+
gap_weight (int or float): The weight of a gap (default: -2).
|
546
|
+
gap_char (str): The character used to represent a gap (default: '-').
|
547
|
+
match_dict (dict): The dictionary that maps the characters to their match weights (default: None).
|
548
|
+
|
549
|
+
.. note::
|
550
|
+
* The default weights here (match 2, mismatch -1, gap -2) differ from the NeedlemanWunsch defaults (match 1, mismatch -1, gap -1).
|
551
|
+
* The time complexity of Hirschberg's algorithm is :math:`O(nm)`, where :math:`n` and :math:`m` are the lengths of the strings str1 and str2, respectively.
|
552
|
+
* The space complexity of Hirschberg's algorithm is, on the other hand, :math:`O(min(n, m))`.
|
553
|
+
* We benefited from the following resources to implement this class: [K2015]_, [W2012]_, [M2010]_, [K2002]_.
|
554
|
+
|
555
|
+
.. [H1975] Hirschberg, Daniel S. "A linear space algorithm for computing maximal common subsequences." Communications of the ACM 18.6 (1975): 341-343.
|
556
|
+
.. [K2015] Kellis, Manolis. Computational Biology: Genomes, Networks, Evolution (MIT Course 6.047/6.878) — https://ocw.mit.edu/ans7870/6/6.047/f15/MIT6_047F15_Compiled.pdf (Accessed on 02-16-2023) (Section 2.5.8; Linear Space Alignment, pg. 38-39).
|
557
|
+
.. [W2012] Wayne, Kevin. Lecture Slides for Algorithm Design - Dynamic Programming II (https://www.cs.princeton.edu/~wayne/kleinberg-tardos/pdf/06DynamicProgrammingII.pdf) (Accessed on 02-16-2023).
|
558
|
+
.. [M2010] Moura, Lucia. Algorithms in Bioinformatics: Lectures 3-5 - Sequence Similarity. Fall 2010. University of Ottawa. (https://www.site.uottawa.ca/~lucia/courses/2010/comp5511/lectures/03-05.pdf) (Accessed on 02-16-2023).
|
559
|
+
.. [K2002] Kingsford, Carl. Lecture 7: Dynamic Programming. 2002. Carnegie Mellon University. (https://www.cs.cmu.edu/~ckingsf/class/02-714/Lec07-linspace.pdf) (Accessed on 02-16-2023).
|
560
|
+
"""
|
561
|
+
|
562
|
+
# Initialize the Needleman-Wunsch algorithm using the super() function.
|
563
|
+
super().__init__(
|
564
|
+
match_weight=match_weight,
|
565
|
+
mismatch_weight=mismatch_weight,
|
566
|
+
gap_weight=gap_weight,
|
567
|
+
match_dict=match_dict,
|
568
|
+
gap_char=gap_char,
|
569
|
+
)
|
570
|
+
|
571
|
+
|
572
|
+
|
573
|
+
# Get the alignment of two strings (or list of strings).
|
574
|
+
def get_alignment(self,
    str1: Union[str, List[str]],
    str2: Union[str, List[str]],
) -> Tuple[Union[str, List[str]], Union[str, List[str]]]:
    r"""
    This function gets the alignment of two strings (or list of strings) by using the Hirschberg algorithm.

    Arguments:
        str1: The first string (or list of strings).
        str2: The second string (or list of strings).

    Returns:
        The aligned strings as a tuple of two strings, in the same order as the inputs. The aligned elements are separated by ' | '.

    .. note::
        * As a notable improvement of the Needleman-Wunsch algorithm, Hirschberg's algorithm combines both divide-and-conquer and dynamic programming principles. Its time complexity is :math:`O(nm)`, where :math:`n` and :math:`m` are the lengths of the strings str1 and str2, respectively. Its space complexity, however, is :math:`O(min(n, m))`.
        * To further improve the algorithm, one may limit the number of insertions and deletions in the alignment (the k-banded variant), which reduces the time complexity from :math:`O(mn)` to :math:`O((m + n) * k)`, where k is the maximum number of insertions or deletions allowed. That variant is not implemented here.
        * Only whole ' | ' separators are removed from the start of the result. Padding spaces that belong to an aligned element are preserved, so both returned strings keep the same layout.
    """
    separator = ' | '
    # Two separators back to back, i.e. ' | ' immediately followed by ' | '.
    doubled_separator = ' |  | '

    def _tidy(aligned: str) -> str:
        # Collapse doubled separators that appear when a half-alignment that
        # starts with a separator is glued to its left neighbour.
        while doubled_separator in aligned:
            aligned = aligned.replace(doubled_separator, separator)
        # Drop leading separators only. str.strip(" | ") would treat its
        # argument as a character set and also eat padding spaces and any
        # leading/trailing ' ' or '|' characters of the aligned elements.
        while aligned.startswith(separator):
            aligned = aligned[len(separator):]
        return aligned

    # The helper is always called with the longer sequence first; the results
    # are swapped back so that they match the order of the inputs.
    if len(str1) >= len(str2):
        aligned_str1, aligned_str2 = self.get_alignment_helper(str1, str2)
    else:
        aligned_str2, aligned_str1 = self.get_alignment_helper(str2, str1)

    # Return the alignment.
    return _tidy(aligned_str1), _tidy(aligned_str2)
|
616
|
+
|
617
|
+
|
618
|
+
|
619
|
+
# Get the alignment of two strings (or list of strings).
|
620
|
+
def get_alignment_helper(self,
    str1: Union[str, List[str]],
    str2: Union[str, List[str]],
) -> Tuple[Union[str, List[str]], Union[str, List[str]]]:
    """
    This is a helper function that is called by the get_alignment() function. This function gets the alignment of two strings (or list of strings) by using the Hirschberg algorithm.

    Arguments:
        str1: The first string (or list of strings).
        str2: The second string (or list of strings).

    Returns:
        The aligned strings as a tuple of two strings whose elements are separated by ' | '. The returned strings never start or end with a separator; two empty inputs give two empty strings.

    .. note::
        * We assume that the length of str1 is greater than or equal to the length of str2.
        * The recursion splits str1 in the middle, finds the best split point of str2 from the forward and backward score rows, and aligns the two halves independently.
    """
    separator = ' | '

    # Lengths of strings str1 and str2, respectively.
    len1 = len(str1)
    len2 = len(str2)

    # Base case: one side is empty, so every element of the other side is
    # paired with the gap character. The pieces are joined (rather than
    # prefixed with the separator) so that no leading ' | ' is produced, which
    # would otherwise turn into a doubled separator once the halves are glued.
    if len1 == 0 or len2 == 0:
        pieces1 = []
        pieces2 = []
        if len1 == 0:
            for element in str2:
                insert_str1, insert_str2 = self.add_space_to_shorter(self.gap_char, element)
                pieces1.append(insert_str1)
                pieces2.append(insert_str2)
        else:
            for element in str1:
                insert_str1, insert_str2 = self.add_space_to_shorter(element, self.gap_char)
                pieces1.append(insert_str1)
                pieces2.append(insert_str2)
        return separator.join(pieces1), separator.join(pieces2)

    # Base case: a single element on either side is cheap enough to align with
    # the parent (Needleman-Wunsch) implementation.
    if len1 == 1 or len2 == 1:
        aligned_str1, aligned_str2 = super().get_alignment(str1, str2)
        return aligned_str1, aligned_str2

    # Get the middle index of str1.
    mid1 = len1 // 2

    # Scores of aligning the left half of str1 with every prefix of str2.
    score_row_left = self.nw_score(str1[:mid1], str2)
    # Score-Right = Reverse ( NW-Score( Reverse(Str1-Mid1), Reverse(Str2) ) )
    score_row_right = self.nw_score(str1[mid1:][::-1], str2[::-1])[::-1]

    # Get mid2 = arg max score_row_left + score_row_right
    mid2 = self.get_middle_index(score_row_left, score_row_right)

    # Get the alignment of the left and right substrings.
    aligned_str1_left, aligned_str2_left = self.get_alignment_helper(str1[:mid1], str2[:mid2])
    aligned_str1_right, aligned_str2_right = self.get_alignment_helper(str1[mid1:], str2[mid2:])

    # Combine the halves, inserting ' | ' only between two non-empty parts.
    aligned_str1 = separator.join(
        part for part in (aligned_str1_left, aligned_str1_right) if part != ""
    )
    aligned_str2 = separator.join(
        part for part in (aligned_str2_left, aligned_str2_right) if part != ""
    )

    # Return the aligned strings.
    return aligned_str1, aligned_str2
|
692
|
+
|
693
|
+
|
694
|
+
|
695
|
+
# Return the last row of the score matrix.
|
696
|
+
def nw_score(self,
    str1: Union[str, List[str]],
    str2: Union[str, List[str]],
) -> List[float]:
    """
    This function returns the last row of the Needleman-Wunsch score matrix of str1 and str2, keeping only two rows in memory.

    Arguments:
        str1: The first string (or list of strings).
        str2: The second string (or list of strings).

    Returns:
        A one-dimensional numpy array of length len(str2) + 1 whose j-th entry is the best score of aligning all of str1 with the first j elements of str2.

    .. note::
        * When str1 is empty, the result is the gap-initialized first row (the cost of inserting every prefix of str2), not a row of zeros.
    """
    # Length of str2; each row has one entry per prefix of str2.
    len2 = len(str2)

    # The gap weight of every element of str2 is needed once per row, so it is
    # looked up a single time here.
    insertion_costs = [self.get_gap_weight(element) for element in str2]

    # First row: aligning the empty prefix of str1 with each prefix of str2.
    previous_row = np.zeros(len2 + 1)
    for j in range(1, len2 + 1):
        previous_row[j] = previous_row[j - 1] + insertion_costs[j - 1]  # insertion cost

    # Sweep over str1, deriving each row from the one before it.
    for element1 in str1:
        deletion_cost = self.get_gap_weight(element1)
        current_row = np.zeros(len2 + 1)
        current_row[0] = previous_row[0] + deletion_cost  # deletion cost

        for j in range(1, len2 + 1):
            current_row[j] = max(
                previous_row[j - 1] + self.get_score(element1, str2[j - 1]),  # match/mismatch cost
                previous_row[j] + deletion_cost,  # deletion cost
                current_row[j - 1] + insertion_costs[j - 1]  # insertion cost
            )

        previous_row = current_row

    # previous_row is the last completed row (the first row if str1 is empty).
    return previous_row
|
738
|
+
|
739
|
+
|
740
|
+
|
741
|
+
# Get the middle index of str2.
|
742
|
+
def get_middle_index(self,
    score_left: List[float],
    score_right: List[float],
) -> int:
    """
    This function picks the split point of str2: the index at which the sum of the left and right scores is largest.

    Arguments:
        score_left: The scores of the left half, one per prefix of str2.
        score_right: The scores of the right half, one per suffix of str2 (same indexing as score_left).

    Returns:
        The first index with the largest combined score (0 if score_left is empty).
    """
    # Start below every possible total: the best total may well be negative,
    # so 0 would be a wrong starting value.
    best_total = -float('inf')
    best_index = 0

    for index in range(len(score_left)):
        total = score_left[index] + score_right[index]
        # Strict comparison keeps the earliest index among equal totals.
        if total > best_total:
            best_index, best_total = index, total

    return best_index
|
776
|
+
|
777
|
+
|
778
|
+
|
779
|
+
# Smith-Waterman algorithm (local alignment).
|
780
|
+
class SmithWaterman(NeedlemanWunsch):
    def __init__(self,
        match_weight: Union[int, float] = 1,
        mismatch_weight: Union[int, float] = -1,
        gap_weight: Union[int, float] = -1,
        gap_char: str = '-',
        match_dict: dict = None,
    ) -> None:
        r"""
        This function sets up the Smith-Waterman algorithm, which computes a local alignment of two sequences (e.g., strings or lists of strings) such as DNA sequences.

        Arguments:
            match_weight (int or float): The weight of a match (default: 1).
            mismatch_weight (int or float): The weight of a mismatch (default: -1).
            gap_weight (int or float): The weight of a gap (default: -1).
            gap_char (str): The character used to represent a gap (default: '-').
            match_dict (dict): The dictionary that maps the characters to their match weights (default: None).

        .. note::
            * All arguments are forwarded unchanged to the NeedlemanWunsch initializer.
            * The Smith-Waterman algorithm can be seen as a variant of the Needleman-Wunsch algorithm in which the first row and column of the score matrix stay 0 and every cell is clamped from below at 0:

            .. math::
                :nowrap:

                \begin{align}
                    \texttt{max score} &= \texttt{max}(\texttt{match score}, \texttt{delete score}, \texttt{insert score}, 0)
                \end{align}

            * Only the get_alignment and backtrack functions of the NeedlemanWunsch class are overridden.
            * The score matrix has :math:`(n+1)(m+1)` cells and every cell is visited once, so both the space and the time complexity are :math:`O(mn)`, where :math:`n` and :math:`m` are the lengths of the two strings (or lists of strings).
        """
        # Delegate all the bookkeeping to the parent class, NeedlemanWunsch.
        super().__init__(
            match_weight=match_weight,
            mismatch_weight=mismatch_weight,
            gap_weight=gap_weight,
            gap_char=gap_char,
            match_dict=match_dict,
        )

    # Backtrack the score matrix.
    # Override the backtrack function of the NeedlemanWunsch class.
    def backtrack(self,
        score_matrix: np.ndarray,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> Tuple[Union[str, List[str]], Union[str, List[str]]]:
        """
        This function walks back through the score matrix to recover an optimal local alignment between two strings (or list of strings).

        Arguments:
            score_matrix (numpy.ndarray): The score matrix.
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).

        Returns:
            The aligned substrings as a tuple of two strings whose elements are separated by ' | '.

        .. note::
            * The walk starts at the cell holding the largest score and stops at the first cell whose score is zero, because the best-scoring local alignment need not span the whole sequences.
            * At every step the move (diagonal, up, left) whose predecessor score plus weight is largest is taken; ties are resolved in that order.
        """
        # Cell with the largest score (first one in row-major order on ties).
        row, col = np.unravel_index(np.argmax(score_matrix, axis=None), score_matrix.shape)

        # Aligned elements, collected from the end of the alignment to its start.
        pieces1 = []
        pieces2 = []

        while score_matrix[row, col] != 0:
            element1 = str1[row - 1]
            element2 = str2[col - 1]

            # Scores of the three possible predecessors.
            diagonal_score = score_matrix[row - 1, col - 1] + self.get_match_weight(element1, element2)
            upward_score = score_matrix[row - 1, col] + self.get_gap_weight(element1)
            leftward_score = score_matrix[row, col - 1] + self.get_gap_weight(element2)

            best_score = max(diagonal_score, upward_score, leftward_score)

            if best_score == diagonal_score:
                piece1, piece2 = self.add_space_to_shorter(element1, element2)
                row -= 1
                col -= 1
            elif best_score == upward_score:
                piece1, piece2 = self.add_space_to_shorter(element1, self.gap_char)
                row -= 1
            elif best_score == leftward_score:
                piece1, piece2 = self.add_space_to_shorter(self.gap_char, element2)
                col -= 1

            pieces1.append(piece1)
            pieces2.append(piece2)

        # The pieces were gathered back to front; restore reading order.
        return ' | '.join(reversed(pieces1)), ' | '.join(reversed(pieces2))

    # Get the alignment of two strings (or list of strings).
    # Override the get_alignment function of the NeedlemanWunsch class.
    def get_alignment(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        return_score_matrix: bool = False,
    ) -> Tuple[Union[str, List[str]], Union[str, List[str]]]:
        """
        This function computes the local alignment of two strings (or list of strings) with the Smith-Waterman algorithm.

        Arguments:
            str1 (str or list of str): The first string (or list of strings).
            str2 (str or list of str): The second string (or list of strings).
            return_score_matrix (bool): Whether to return the score matrix (default: False)

        Returns:
            The aligned strings as a tuple of two strings. If return_score_matrix is True, the score matrix is returned as a third element.

        .. note::
            * The first row and column of the score matrix are left at 0, and every other cell is max(match_score, delete_score, insert_score, 0).
            * The alignment itself is recovered by the backtrack function of this class.
        """
        row_count = len(str1) + 1
        col_count = len(str2) + 1

        # All zeros: the first row and column are never touched again.
        score_matrix = np.zeros((row_count, col_count))

        # Fill the remaining cells row by row.
        for i in range(1, row_count):
            element1 = str1[i - 1]
            for j in range(1, col_count):
                element2 = str2[j - 1]
                # The trailing 0. is what makes the alignment local: a cell
                # never carries a negative score forward.
                score_matrix[i, j] = max(
                    score_matrix[i - 1, j - 1] + self.get_match_weight(element1, element2),
                    score_matrix[i - 1, j] + self.get_gap_weight(element1),
                    score_matrix[i, j - 1] + self.get_gap_weight(element2),
                    0.,
                )

        # Recover the alignment from the filled matrix.
        aligned_str1, aligned_str2 = self.backtrack(score_matrix, str1, str2)

        if return_score_matrix:
            return aligned_str1, aligned_str2, score_matrix
        return aligned_str1, aligned_str2
|
948
|
+
|
949
|
+
|
950
|
+
|
951
|
+
# Dynamic time warping (DTW) class.
|
952
|
+
class DTW:
    def __init__(self) -> None:
        r"""
        This function initializes the Dynamic time warping (DTW) class.
        """
        pass

    # Get the alignment indices of two sequences (or list of sequences) by using the DTW algorithm.
    def get_alignment_path(self,
        sequence1: Union[str, List[str], int, List[int], float, List[float], np.ndarray],
        sequence2: Union[str, List[str], int, List[int], float, List[float], np.ndarray],
        distance: str = 'absolute_difference',
        p_value: Optional[int] = None,
    ) -> List[Tuple[int, int]]:
        """
        This function gets the alignment indices of two sequences (or list of sequences) by using the DTW algorithm.

        Arguments:
            sequence1: The first sequence.
            sequence2: The second sequence.
            distance (str): The distance function to be used (currently only 'absolute_difference' and 'square_difference' are supported).
            p_value (int): Currently unused; kept for backward compatibility of the signature.

        Returns:
            The path of the alignment as a list of (index in sequence1, index in sequence2) tuples, ordered from the start of the sequences to their end. If either sequence is empty, the path is empty.

        Raises:
            TypeError: If the input sequences are not of the same type.
            ValueError: If the distance function is not supported.

        .. note::
            * The DTW algorithm is a dynamic programming algorithm that finds the optimal alignment between two sequences (or list of sequences).
            * The time complexity of the DTW algorithm is :math:`O(nm)`, where :math:`n` and :math:`m` are the lengths of the two sequences, respectively.
            * Strings (and lists of single-character strings) are compared through the code points of their characters.
        """
        # First check if both sequences are of the same type.
        if type(sequence1) is not type(sequence2):
            raise TypeError("Both sequences must be of the same type.")

        # Check if the distance function is supported.
        if distance not in ('absolute_difference', 'square_difference'):
            raise ValueError("The distance function must be either 'absolute_difference' or 'square_difference'.")

        # Decide the cost function once. The name of the distance must not be
        # overwritten by a per-cell cost, otherwise every cell after the first
        # would silently fall back to the squared difference.
        use_absolute_difference = distance == 'absolute_difference'

        # If the sequences are strings or lists of strings, convert them to arrays of code points.
        # The length check avoids indexing into an empty list.
        if type(sequence1) == str or (
            type(sequence1) == list and len(sequence1) > 0 and type(sequence1[0]) == str
        ):
            sequence1 = np.array([ord(char) for char in sequence1])
            sequence2 = np.array([ord(char) for char in sequence2])

        # Get the lengths of the sequences.
        len1 = len(sequence1)
        len2 = len(sequence2)

        # Accumulated-cost matrix. Everything starts at infinity except the
        # origin, so that paths cannot enter through the first row or column.
        distance_matrix = np.full((len1 + 1, len2 + 1), np.inf)
        distance_matrix[0, 0] = 0.

        # Fill the DTW distance matrix.
        for i in range(1, len1 + 1):
            for j in range(1, len2 + 1):
                # Cost of pairing the two elements.
                difference = sequence1[i - 1] - sequence2[j - 1]
                if use_absolute_difference:
                    cost = abs(difference)
                else:
                    cost = difference ** 2

                # Cheapest way to reach this cell from its three predecessors.
                distance_matrix[i, j] = cost + min(
                    distance_matrix[i - 1, j],
                    distance_matrix[i, j - 1],
                    distance_matrix[i - 1, j - 1]
                )

        # Walk back from the last cell to the origin. Because the first row
        # and column are infinite, the walk of two non-empty sequences always
        # ends on the diagonal step into (0, 0); stopping as soon as either
        # index reaches 0 also keeps empty inputs from producing -1 indices.
        alignment = []
        i = len1
        j = len2
        while i > 0 and j > 0:
            alignment.append((i - 1, j - 1))
            if distance_matrix[i - 1, j] < distance_matrix[i, j - 1] and distance_matrix[i - 1, j] < distance_matrix[i - 1, j - 1]:
                i -= 1
            elif distance_matrix[i, j - 1] < distance_matrix[i - 1, j - 1]:
                j -= 1
            else:
                i -= 1
                j -= 1

        # The path was collected from the end; put it in reading order.
        alignment.reverse()

        # Return the alignment.
        return alignment
|
1054
|
+
|
1055
|
+
# Longest common subsequence (LCSubsequence) class
|
1056
|
+
class LongestCommonSubsequence(StringAlignment):
    # Initialize the class
    def __init__(self,
        list_of_list_separator: str = " ## ",
    ) -> None:
        r"""
        This function sets up the Longest Common Subsequence (LCSubsequence) class, which inherits from the StringAlignment class.

        The longest common subsequence (LCSubsequence) of two strings is a subsequence of maximal length that appears in both of them.

        The LCSubsequence problem satisfies the following recurrence relation:

        .. math::
            :nowrap:

            \begin{align}
            L[i,j] =
            \begin{cases}
            0 &\text{ if } i=0 \text{ or } j=0\\
            L[i-1,j-1]+1 &\text{ if } i,j>0 \text{ and } str1[i]=str2[j]\\
            \max(L[i-1,j],L[i,j-1]) &\text{ if } i,j>0 \text{ and } str1[i]\neq str2[j]\\
            \end{cases}
            \end{align}

        where :math:`L[i,j]` denotes the length of the LCSubsequence of the prefixes str1[0:i] and str2[0:j]. The solution to the problem is then given by :math:`L[n,m]`, assuming that str1 and str2 have lengths n and m, respectively.

        The dynamic programming solution used here has a quadratic (i.e., :math:`\mathcal{O}(nm)`) space and time complexity.

        If the vocabulary is fixed, LCSubsequence admits a "Four-Russians speedup," which reduces its overall time complexity to subquadratic :math:`\mathcal{O}(n^2/\log n)`, but this algorithm is not yet implemented in this package.

        Arguments:
            list_of_list_separator (str): Separator to use when the inputs are lists of strings.
        """
        # A match counts 1; mismatches and gaps cost nothing.
        super().__init__(match_weight=1, mismatch_weight=0, gap_weight=0)

        # Remember the separator used to encode lists of strings.
        self.list_of_list_separator = list_of_list_separator

    # Compute the longest common subsequence between two strings
    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        returnCandidates: bool = False,
    ) -> Tuple[float, Union[List[str], List[List[str]]]]:
        r"""
        This function computes the longest common subsequence between two strings (or lists of strings).

        Arguments:
            str1 (str or list of str): The first string (or list of strings) to compare.
            str2 (str or list of str): The second string (or list of strings).
            returnCandidates (bool): Whether to return the candidates for the longest common subsequence (default: False).

        Returns:
            A tuple (length, candidates).

            * length is the length of the longest common subsequence between the two inputs.
            * candidates is None if returnCandidates is False; otherwise it holds the distinct longest common subsequences found by backtracking (each one split on the list separator when both inputs are lists).

        .. note::
            * The full (n + 1) x (m + 1) table is kept because the backtracking step reads it. If only the length were needed, two rows would suffice, reducing the space complexity to :math:`\mathcal{O}(m)`; that optimization is not applied here.
        """
        # Both inputs being lists switches the candidates to the list encoding.
        boolList = isinstance(str1, list) and isinstance(str2, list)

        # Lengths of strings str1 and str2, respectively.
        n = len(str1)
        m = len(str2)

        # Table of LCSubsequence lengths of all pairs of prefixes.
        dist = np.zeros((n + 1, m + 1))

        # Dynamic programming solution to the longest common subsequence.
        for i in range(1, n + 1):
            for j in range(1, m + 1):
                if self.bool_match(str1[i-1], str2[j-1]):
                    dist[i, j] = dist[i-1, j-1] + 1
                else:
                    dist[i, j] = max(dist[i-1, j], dist[i, j-1])

        # TODO(msuzgun): At the moment, the backtrack function is not optimized and pretty slow. It should be optimized!
        def backtrack(i: int, j: int) -> Union[List[str], List[List[str]]]:
            """
            This function recursively walks back through the table from cell (i, j) and gathers the longest common subsequences of the prefixes str1[0:i] and str2[0:j].

            Arguments:
                i (int): The row index of the distance matrix.
                j (int): The column index of the distance matrix.

            Returns:
                The list of distinct longest common subsequences of the two prefixes (empty when either prefix is empty).
            """
            # An empty prefix has no common subsequence to report.
            if i == 0 or j == 0:
                return []

            # Matching elements extend every candidate of the diagonal cell.
            if self.bool_match(str1[i-1], str2[j-1]):
                insert_elt = [str1[i-1]] if boolList else str1[i-1]
                extended = cartesian_product(
                    backtrack(i-1, j-1),
                    insert_elt,
                    boolList=boolList,
                    list_of_list_separator=self.list_of_list_separator,
                )
                return list(set(extended))

            # Otherwise follow whichever neighbour(s) hold the larger length;
            # on a tie both are followed.
            length_left = dist[i, j-1]
            length_up = dist[i-1, j]
            merged = []
            if length_left >= length_up:
                merged.extend(backtrack(i, j-1))
            if length_up >= length_left:
                merged.extend(backtrack(i-1, j))
            return list(set(merged))

        # Without candidates only the length is reported.
        if not returnCandidates:
            return dist[n, m], None

        # Gather the candidates and decode them for list inputs.
        candidates = backtrack(n, m)
        if boolList:
            candidates = [
                elt.split(self.list_of_list_separator) for elt in candidates
            ]
        return dist[n, m], candidates
|
1191
|
+
|
1192
|
+
|
1193
|
+
|
1194
|
+
|
1195
|
+
# Longest common substring (LCSubstring) class
|
1196
|
+
class LongestCommonSubstring(LongestCommonSubsequence):
|
1197
|
+
# Initialize the class
|
1198
|
+
def __init__(self,
    list_of_list_separator: str = " ## ",
) -> None:
    r"""
    This function sets up the LongestCommonSubstring (LCSubstring) class.

    The Longest Common Substring (LCSubstring) of two strings is the longest substring that appears in both of them.

    The LCSubstring problem satisfies the following recurrence relation:

    .. math::
        :nowrap:

        \begin{align}
        L[i,j] =
        \begin{cases}
        0 &\text{ if } i=0 \text{ or } j=0\\
        L[i-1,j-1]+1 &\text{ if } i,j>0 \text{ and } str1[i]=str2[j]\\
        0 &\text{ if } i,j>0 \text{ and } str1[i]\neq str2[j]\\
        \end{cases}
        \end{align}

    where :math:`L[i,j]` denotes the length of the LCSubstring that ends at indices i and j in str1 and str2, respectively. The solution to the problem is then given by the maximum value of :math:`L[i,j]`, assuming that str1 and str2 have lengths n and m, respectively.

    A dynamic programming solution exists for this problem with a quadratic (i.e., :math:`\mathcal{O}(nm)`) space and time complexity.

    Arguments:
        list_of_list_separator (str): Separator to use when the inputs are lists of strings.

    Returns:
        None
    """
    # Everything, including the separator, is handled by the parent class.
    super().__init__(list_of_list_separator=list_of_list_separator)
|
1232
|
+
|
1233
|
+
|
1234
|
+
|
1235
|
+
# Compute the longest common substring between two strings
|
1236
|
+
def compute(self,
|
1237
|
+
str1: Union[str, List[str]],
|
1238
|
+
str2: Union[str, List[str]],
|
1239
|
+
returnCandidates: bool = False,
|
1240
|
+
) -> Tuple[float, Union[List[str], List[List[str]]]]:
|
1241
|
+
"""
|
1242
|
+
This function computes the longest common substring between two strings (or lists of strings).
|
1243
|
+
|
1244
|
+
Arguments:
|
1245
|
+
str1 (str or list of str): The first string (or list of strings).
|
1246
|
+
str2 (str or list of str): The second string (or list of strings).
|
1247
|
+
returnCandidates (bool): A boolean flag indicating whether to return the longest common substring as a list of lists.
|
1248
|
+
|
1249
|
+
Returns:
|
1250
|
+
If returnCandidates is False, then the length of the longest common substring between the two strings. If returnCandidates is True, then the set of longest common substrings between the two strings (or lists of strings) is also returned.
|
1251
|
+
|
1252
|
+
|
1253
|
+
.. note::
|
1254
|
+
* There exists a linear-time solution to LCSubstring problem that uses generalized suffix trees.
|
1255
|
+
* As with the longest common subsequence problem, the longest common substring is not unique. It is possible to have multiple substrings with the same maximum length that appear in both strings.
|
1256
|
+
* Similar to the dynamic programming solution for the longest common subsequence problem, the last row of the matrix can also be used to optimize the computation of the longest common substring.
|
1257
|
+
* It's important to note that the longest common substring is different from the longest common subsequence. The longest common substring is a contiguous sequence of symbols that appears in both strings, while the longest common subsequence is a sequence of characters that may not be contiguous.
|
1258
|
+
* The longest common substring is a measure of similarity between two strings and is used in various fields, including computational biology, where it is used to compare DNA sequences. Similarly, it is also used in plagirism detection and other applications.
|
1259
|
+
"""
|
1260
|
+
# Determine whether the inputs are lists of strings.
|
1261
|
+
boolList = False
|
1262
|
+
if isinstance(str1, list) and isinstance(str2, list):
|
1263
|
+
boolList = True
|
1264
|
+
|
1265
|
+
# Lengths of strings str1 and str2, respectively.
|
1266
|
+
n = len(str1)
|
1267
|
+
m = len(str2)
|
1268
|
+
|
1269
|
+
# Initialize the distance matrix.
|
1270
|
+
dist = np.zeros((n + 1, m + 1), dtype=int)
|
1271
|
+
|
1272
|
+
# Initialize the longest common substring length.
|
1273
|
+
longest_common_substring_length = 0
|
1274
|
+
|
1275
|
+
# Initialize the longest common substring candidates.
|
1276
|
+
longest_common_substring_indices = []
|
1277
|
+
|
1278
|
+
# Dynamic programming solution to the longest common substring.
|
1279
|
+
for i in range(1, n + 1):
|
1280
|
+
for j in range(1, m + 1):
|
1281
|
+
# if str1[i-1] == str2[j-1]: # # This is the original code. changed: 2023-03-19, 10:05 PM
|
1282
|
+
if self.bool_match(str1[i-1], str2[j-1]):
|
1283
|
+
dist[i, j] = dist[i-1, j-1] + 1
|
1284
|
+
if dist[i, j] > longest_common_substring_length:
|
1285
|
+
longest_common_substring_length = dist[i, j]
|
1286
|
+
longest_common_substring_indices = [i]
|
1287
|
+
# candidates = [str1[i-longest_common_substring_length:i]]
|
1288
|
+
elif dist[i, j] == longest_common_substring_length:
|
1289
|
+
# candidates.append(str1[i-longest_common_substring_length:i])
|
1290
|
+
longest_common_substring_indices.append(i)
|
1291
|
+
else:
|
1292
|
+
dist[i, j] = 0
|
1293
|
+
|
1294
|
+
# If returnCandidates is True, then additionally return the set of longest common substrings.
|
1295
|
+
if returnCandidates:
|
1296
|
+
longest_common_substring_candidates = [str1[i-longest_common_substring_length:i] for i in longest_common_substring_indices]
|
1297
|
+
if boolList:
|
1298
|
+
# TODO(msuzgun): Double check this. Correct, but there might be a better way to do this.
|
1299
|
+
longest_common_substring_candidates = list(set(
|
1300
|
+
[
|
1301
|
+
f"{self.list_of_list_separator}".join(cand) for cand in longest_common_substring_candidates
|
1302
|
+
]
|
1303
|
+
))
|
1304
|
+
longest_common_substring_candidates = [
|
1305
|
+
cand.split(self.list_of_list_separator) for cand in longest_common_substring_candidates
|
1306
|
+
]
|
1307
|
+
longest_common_substring_candidates = set(tuple(elt) for elt in longest_common_substring_candidates)
|
1308
|
+
else:
|
1309
|
+
longest_common_substring_candidates = list(set(longest_common_substring_candidates))
|
1310
|
+
return longest_common_substring_length, longest_common_substring_candidates
|
1311
|
+
return longest_common_substring_length, None
|
1312
|
+
|
1313
|
+
"""
|
1314
|
+
string2string code ends here
|
1315
|
+
"""
|