cfe_gotoh 0.4.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/cfe_gotoh/cfe_gotoh.cpp +862 -0
- data/ext/cfe_gotoh/extconf.rb +4 -0
- data/lib/cfe_gotoh.rb +413 -0
- metadata +49 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 69fd99e925d82fb14d1035e8461fd32d6d83895ae9bf012efea277596772e7ce
|
4
|
+
data.tar.gz: 92b0780c678fa34b4be4d2148df83ed1f60c9d63c15a5742f3c17fa1cb4beb24
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ee778affdf1b42aca92b9acbedf6368df7b906a0b5ab6793132b126f04b781baa2a88a04254b0aa9c51a2f2dd3b09b9b0713edbba9ba85db86e26b6eaa0a5745
|
7
|
+
data.tar.gz: 01f949ecf25c278706fe4718b59e9ac3aa6e643d8ab3063a5fa1aba390d7f2fd3b28e7dbe1703fce94a84affd7b6573e1bbac29f117bb3442d7a6ef199fd8cbe
|
@@ -0,0 +1,862 @@
|
|
1
|
+
#include <string>
|
2
|
+
|
3
|
+
#ifdef __PYTHON__
|
4
|
+
#include <Python.h>
|
5
|
+
#else
|
6
|
+
#include "ruby.h"
|
7
|
+
|
8
|
+
#ifndef RSTRING_PTR
|
9
|
+
// Ruby 1.8.5 doesn't include this definition
|
10
|
+
#define RSTRING_PTR(s) (RSTRING(s)->ptr)
|
11
|
+
#endif
|
12
|
+
#endif
|
13
|
+
|
14
|
+
using namespace std;
|
15
|
+
|
16
|
+
/*
|
17
|
+
I think this application should do a complete alignment. Unfortunately aligning is way too slow in perl,
|
18
|
+
so I suspect the alignment, merging and possibly the gap widening should be done in c. Another possibility
|
19
|
+
is to call the C functions from perl, which would simplify things quite a bit! Unfortunately, I'm not entirely
|
20
|
+
confident in that perl can do this seamlessly(unlike nicer languages like ruby and python).
|
21
|
+
*/
|
22
|
+
|
23
|
+
void trim(string* seq);
|
24
|
+
|
25
|
+
// Pairwise substitution scores, indexed by the ASCII codes of the two symbols.
// 7-bit ASCII covers 0..127 inclusive, so each dimension needs 128 slots.
static int nucMat[128][128];

// Score `symbol` against every character of the NUL-terminated string
// `partners`, in both orientations, so the table stays symmetric.
static void score_against(char symbol, const char *partners, int score)
{
    for (const char *p = partners; *p != '\0'; ++p)
    {
        nucMat[(int)symbol][(int)*p] = score;
        nucMat[(int)*p][(int)symbol] = score;
    }
}

/*
    Fill nucMat with nucleotide scores.

    matchscore       score given to identical symbols and to compatible pairs
                     (case variants, T/U, IUPAC mixtures, the '*' wildcard)
    mismatchPenalty  positive penalty; every other pair scores -mismatchPenalty
                     unless one of the fixed special cases below applies

    Every cell of the table is rewritten on each call.
*/
void init_pairscore(int matchscore, int mismatchPenalty)
{
    const int dim = sizeof(nucMat) / sizeof(nucMat[0]);

    // naive pass: identity matches, everything else mismatches
    for (int i = 0; i < dim; i++)
    {
        for (int j = 0; j < dim; j++)
        {
            nucMat[i][j] = (i == j) ? matchscore : -mismatchPenalty;
        }
    }

    // adjust the naive assignments for case-insensitivity; T and U are
    // interchangeable in every case combination
    score_against('A', "a", matchscore);
    score_against('C', "c", matchscore);
    score_against('G', "g", matchscore);
    score_against('T', "tuU", matchscore);
    score_against('t', "uU", matchscore);
    score_against('U', "u", matchscore);

    // N against N is neutral regardless of case
    nucMat['N']['N'] = 0;
    nucMat['n']['n'] = 0;
    score_against('N', "n", 0);

    // bi-mixtures (upper case only)
    score_against('R', "AG", matchscore);
    score_against('Y', "CT", matchscore);
    score_against('K', "GT", matchscore);
    score_against('M', "AC", matchscore);
    score_against('S', "CG", matchscore);
    score_against('W', "AT", matchscore);

    // tri-mixtures (upper case only)
    score_against('B', "CGT", matchscore);
    score_against('D', "AGT", matchscore);
    score_against('H', "ACT", matchscore);
    score_against('V', "ACG", matchscore);

    // wild card
    score_against('*', "ACGTacgt", matchscore);

    // '$' only pairs well with itself
    nucMat['$']['$'] = 50;

    // for those annoying duplicate phred values
    score_against('.', "ACGTacgt", -20);

    // an upper-case N against a concrete base is a mild mismatch
    score_against('N', "ACGTacgt", -3);

    // for easy alignment to a standard with gaps: 'X' in the standard is
    // expensive against any base or mixture, but rewarded against a gap
    score_against('X', "ACGTRYKMSWBDHVacgtrykmswbdhv", -6);
    score_against('X', "-", 3);
}
|
132
|
+
|
133
|
+
|
134
|
+
void init_pairscore_aa(int matchscore, int mismatchPenalty)
|
135
|
+
{
|
136
|
+
for (int i=0; i<127; i++)
|
137
|
+
{
|
138
|
+
for (int j=0; j<127; j++)
|
139
|
+
{
|
140
|
+
if(i==j)
|
141
|
+
{
|
142
|
+
nucMat[i][j]=matchscore;
|
143
|
+
}
|
144
|
+
else
|
145
|
+
{
|
146
|
+
nucMat[i][j]=-mismatchPenalty;
|
147
|
+
if((char)i=='X' || (char)j=='X')
|
148
|
+
{
|
149
|
+
nucMat[i][j]=-4;
|
150
|
+
}
|
151
|
+
}
|
152
|
+
}
|
153
|
+
}
|
154
|
+
|
155
|
+
nucMat['Z']['Z']=nucMat['z']['Z']=nucMat['Z']['z']=0;
|
156
|
+
nucMat['X']['-']=nucMat['-']['X']=matchscore;
|
157
|
+
}
|
158
|
+
|
159
|
+
|
160
|
+
/*
|
161
|
+
Empirical score matrix based on 25% divergent HIV sequences
|
162
|
+
See Nickle, David C., et al. "HIV-specific probabilistic models of protein evolution."
|
163
|
+
PLoS One 2.6 (2007): e503.
|
164
|
+
*/
|
165
|
+
// 24x24 empirical score table. Rows and columns follow the alphabet order
// A R N D C Q E G H I L K M F P S T W Y V B Z ? *
static int empirical_hiv25[24][24] = {
    /* A */ {7,-7,-7,-4,-10,-11,-4,-3,-10,-6,-9,-9,-7,-13,-3,-2,1,-16,-15,0,-5,-5,-3,-17},
    /* R */ {-7,7,-5,-11,-8,-2,-7,-2,0,-6,-6,2,-3,-12,-4,-2,-2,-5,-9,-10,-7,-3,-3,-17},
    /* N */ {-7,-5,8,2,-9,-6,-6,-7,0,-6,-12,0,-10,-12,-9,1,0,-17,-3,-10,6,-6,-3,-17},
    /* D */ {-4,-11,2,8,-14,-10,0,-2,-3,-11,-15,-7,-13,-15,-13,-5,-6,-16,-6,-5,7,0,-3,-17},
    /* C */ {-10,-8,-9,-14,11,-16,-15,-5,-7,-11,-9,-13,-14,0,-12,-1,-6,-2,0,-8,-10,-16,-5,-17},
    /* Q */ {-11,-2,-6,-10,-16,8,-2,-10,0,-12,-4,0,-8,-12,-1,-9,-8,-14,-9,-13,-7,6,-4,-17},
    /* E */ {-4,-7,-6,0,-15,-2,7,-1,-9,-12,-15,-1,-10,-17,-13,-11,-8,-15,-12,-5,0,6,-4,-17},
    /* G */ {-3,-2,-7,-2,-5,-10,-1,7,-10,-11,-14,-6,-12,-9,-11,-1,-7,-5,-14,-5,-4,-3,-4,-17},
    /* H */ {-10,0,0,-3,-7,0,-9,-10,10,-10,-4,-5,-10,-6,-3,-6,-6,-11,2,-14,-1,-2,-3,-17},
    /* I */ {-6,-6,-6,-11,-11,-12,-12,-11,-10,7,0,-7,0,-2,-10,-4,0,-14,-9,2,-7,-12,-2,-17},
    /* L */ {-9,-6,-12,-15,-9,-4,-15,-14,-4,0,6,-10,0,0,-3,-5,-8,-6,-8,-4,-13,-6,-4,-17},
    /* K */ {-9,2,0,-7,-13,0,-1,-6,-5,-7,-10,7,-4,-14,-9,-5,-1,-12,-13,-9,-1,-1,-2,-17},
    /* M */ {-7,-3,-10,-13,-14,-8,-10,-12,-10,0,0,-4,10,-7,-11,-9,-1,-11,-15,0,-11,-9,-3,-17},
    /* F */ {-13,-12,-12,-15,0,-12,-17,-9,-6,-2,0,-14,-7,10,-11,-5,-10,-5,1,-5,-13,-14,-3,-17},
    /* P */ {-3,-4,-9,-13,-12,-1,-13,-11,-3,-10,-3,-9,-11,-11,8,-1,-3,-13,-11,-12,-10,-3,-5,-17},
    /* S */ {-2,-2,1,-5,-1,-9,-11,-1,-6,-4,-5,-5,-9,-5,-1,8,0,-12,-6,-9,0,-10,-3,-17},
    /* T */ {1,-2,0,-6,-6,-8,-8,-7,-6,0,-8,-1,-1,-10,-3,0,7,-16,-10,-4,-2,-8,-2,-17},
    /* W */ {-16,-5,-17,-16,-2,-14,-15,-5,-11,-14,-6,-12,-11,-5,-13,-12,-16,10,-4,-16,-16,-14,-8,-17},
    /* Y */ {-15,-9,-3,-6,0,-9,-12,-14,2,-9,-8,-13,-15,1,-11,-6,-10,-4,10,-12,-4,-10,-4,-17},
    /* V */ {0,-10,-10,-5,-8,-13,-5,-5,-14,2,-4,-9,0,-5,-12,-9,-4,-16,-12,7,-7,-7,-3,-17},
    /* B */ {-5,-7,6,7,-10,-7,0,-4,-1,-7,-13,-1,-11,-13,-10,0,-2,-16,-4,-7,7,-2,-4,-17},
    /* Z */ {-5,-3,-6,0,-16,6,6,-3,-2,-12,-6,-1,-9,-14,-3,-10,-8,-14,-10,-7,-2,6,-4,-17},
    /* ? */ {-3,-3,-3,-3,-5,-4,-4,-4,-3,-2,-4,-2,-3,-3,-5,-3,-2,-8,-4,-3,-4,-4,-3,-17},
    /* * */ {-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,1}
};
|
190
|
+
|
191
|
+
void init_pairscore_hiv25(void) {
|
192
|
+
// ASCII codes for protein alphabet ARNDCQEGHILKMFPSTWYVBZ?*
|
193
|
+
int aa_to_ascii[24] = { 65, 82, 78, 68, 67, 81, 69, 71, 72, 73, 76, 75, 77, 70, 80, 83, 84,
|
194
|
+
87, 89, 86, 66, 90, 63, 42 };
|
195
|
+
int i2, j2;
|
196
|
+
|
197
|
+
// reset score matrix to be safe
|
198
|
+
for (int i=0; i<127; i++) {
|
199
|
+
for (int j=0; j<127; j++) {
|
200
|
+
nucMat[i][j] = 0;
|
201
|
+
}
|
202
|
+
}
|
203
|
+
|
204
|
+
// map HIV 25% empirical matrix to score matrix
|
205
|
+
for (int i=0; i<24; i++) {
|
206
|
+
i2 = aa_to_ascii[i];
|
207
|
+
for (int j=0; j<24; j++) {
|
208
|
+
j2 = aa_to_ascii[j];
|
209
|
+
// also map to lowercase
|
210
|
+
nucMat[i2+32][j2+32] = nucMat[i2+32][j2] = nucMat[i2][j2+32] = nucMat[i2][j2] = empirical_hiv25[i][j];
|
211
|
+
}
|
212
|
+
}
|
213
|
+
}
|
214
|
+
|
215
|
+
|
216
|
+
extern int pairscore(char a, char b)
|
217
|
+
{
|
218
|
+
return nucMat[a][b];
|
219
|
+
}
|
220
|
+
|
221
|
+
// Replace *seq with its characters in reverse order.
void reverse(string* seq)
{
    // build the mirrored copy from the reverse iterators, then hand it over
    string flipped(seq->rbegin(), seq->rend());
    seq->swap(flipped);
}
|
230
|
+
|
231
|
+
|
232
|
+
//Error must be somewhere in here. Ug...
|
233
|
+
int align(string* seqa, string* seqb, string* newseqa, string* newseqb,
|
234
|
+
int gip, int gep, int use_terminal_gap_penalty)
|
235
|
+
{
|
236
|
+
/*
|
237
|
+
Pairwise alignment with affine gap penalty.
|
238
|
+
see Gotoh, Osamu. "Optimal alignment between groups of sequences and its application
|
239
|
+
to multiple sequence alignment." Computer applications in the biosciences: CABIOS 9.3
|
240
|
+
(1993): 361-370.
|
241
|
+
|
242
|
+
Gap open and extension penalties [gip] and [gep] are assumed to take positive values.
|
243
|
+
*/
|
244
|
+
|
245
|
+
int M = seqa->size(); // first group of pre-aligned sequences
|
246
|
+
int N = seqb->size(); // second group
|
247
|
+
|
248
|
+
// if empty ref, return seqb as-is, and seqa as gaps of size(seqb)
|
249
|
+
// prevents a buffer overflow in the traceback matrices which assume M>0
|
250
|
+
if (M==0)
|
251
|
+
{
|
252
|
+
int j;
|
253
|
+
int alignment_score=0;
|
254
|
+
for (j=0 ; j < N ; j++)
|
255
|
+
{
|
256
|
+
//skip terminal (whole seq) gap penalties if user specifies this option
|
257
|
+
if (use_terminal_gap_penalty==0) alignment_score += (j==0) ? (gip+gep) : gep ;
|
258
|
+
*newseqa += '-';
|
259
|
+
*newseqb += (*seqb)[j];;
|
260
|
+
}
|
261
|
+
|
262
|
+
return alignment_score;
|
263
|
+
}
|
264
|
+
|
265
|
+
int i, j;
|
266
|
+
|
267
|
+
// not all elements of D, P, and Q need to be stored - vectors are adequate
|
268
|
+
int *SS=new int[N+1]; // D(i, .)
|
269
|
+
int *oldSS=new int[N+1]; // D(i-1, .)
|
270
|
+
int *PP = new int[N+1]; // P(i, .)
|
271
|
+
|
272
|
+
// Gotoh traceback matrices
|
273
|
+
int **piSS = new int*[M+1];
|
274
|
+
int **pjSS = new int*[M+1];
|
275
|
+
|
276
|
+
int u = -gip; // affine gap initiation penalty
|
277
|
+
int v = -gep; // affine gap extension penalty
|
278
|
+
|
279
|
+
int w1 = u + v; // gap weight w_k = v * k + u for k = 1
|
280
|
+
int t = u;
|
281
|
+
int s, q;
|
282
|
+
|
283
|
+
// initialize vectors
|
284
|
+
for (j=0; j<N+1; j++)
|
285
|
+
{
|
286
|
+
SS[j]=0;
|
287
|
+
oldSS[j]=0;
|
288
|
+
PP[j]=0;
|
289
|
+
}
|
290
|
+
|
291
|
+
// initialize traceback matrices
|
292
|
+
piSS[0] = new int[N+1];
|
293
|
+
pjSS[0] = new int[N+1];
|
294
|
+
piSS[1] = new int[N+1];
|
295
|
+
pjSS[1] = new int[N+1];
|
296
|
+
piSS[1][0] = 0;
|
297
|
+
pjSS[1][0] = 0;
|
298
|
+
piSS[0][1] = 0;
|
299
|
+
pjSS[0][1] = 0;
|
300
|
+
|
301
|
+
int maxiS = -100000;
|
302
|
+
int maxjS = -100000;
|
303
|
+
int maxij, maxji;
|
304
|
+
|
305
|
+
for (i=1; i < M+1; i++)
|
306
|
+
{
|
307
|
+
t += v; // update gap extension
|
308
|
+
s = t;
|
309
|
+
SS[0]=0;
|
310
|
+
q = t + u;
|
311
|
+
|
312
|
+
// add new rows
|
313
|
+
if (i>1)
|
314
|
+
{
|
315
|
+
piSS[i] = new int[N+1];
|
316
|
+
pjSS[i] = new int[N+1];
|
317
|
+
}
|
318
|
+
|
319
|
+
for (j = 1; j < N + 1; j++)
|
320
|
+
{
|
321
|
+
// recursive calculation of Q
|
322
|
+
if (q >= s + u )
|
323
|
+
q += v; // extension
|
324
|
+
else
|
325
|
+
q = s + u + v; // open
|
326
|
+
|
327
|
+
// recursive calculation of P
|
328
|
+
if ((oldSS[j] + w1) > (PP[j] + v))
|
329
|
+
PP[j] = oldSS[j] + w1;
|
330
|
+
else
|
331
|
+
PP[j] += v;
|
332
|
+
|
333
|
+
int tmp_pp = PP[j];
|
334
|
+
|
335
|
+
// D(i-1, j-1) + d(a_i, b_j)
|
336
|
+
int pscore = oldSS[j - 1] + pairscore((*seqa)[i - 1], (*seqb)[j - 1]);
|
337
|
+
|
338
|
+
//no idea if this will work, but its supposed to be a stop codon aligner
|
339
|
+
//the bonus is assigned on the last codon, if the codons between don't make a big difference it'll be wrong. Hrm.
|
340
|
+
|
341
|
+
if(i >= 3 && j >= 3 && (*seqa)[i-3] == '$' && (*seqa)[i-2] == '$' && (*seqa)[i-1] == '$' &&
|
342
|
+
(((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'A' && (*seqb)[j-1] == 'G') ||
|
343
|
+
((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'A' && (*seqb)[j-1] == 'A') ||
|
344
|
+
((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'G' && (*seqb)[j-1] == 'A') ))
|
345
|
+
{
|
346
|
+
pscore += 6;
|
347
|
+
}
|
348
|
+
if(i >= 3 && j >= 3 && (*seqa)[i-2] == '$' && (*seqa)[i-1] == '$' && (*seqa)[i-0] == '$' &&
|
349
|
+
(((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'A' && (*seqb)[j-0] == 'G') ||
|
350
|
+
((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'A' && (*seqb)[j-0] == 'A') ||
|
351
|
+
((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'G' && (*seqb)[j-0] == 'A') ))
|
352
|
+
{
|
353
|
+
pscore += 6;
|
354
|
+
}
|
355
|
+
if(i >= 3 && j >= 3 && (*seqa)[i-1] == '$' && (*seqa)[i-0] == '$' && (*seqa)[i+1] == '$' &&
|
356
|
+
(((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'A' && (*seqb)[j+1] == 'G') ||
|
357
|
+
((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'A' && (*seqb)[j+1] == 'A') ||
|
358
|
+
((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'G' && (*seqb)[j+1] == 'A') ))
|
359
|
+
{
|
360
|
+
pscore += 6;
|
361
|
+
}
|
362
|
+
|
363
|
+
/*
|
364
|
+
D(i,j) = Min { D(i-1, j-1) + d(a_i, b_j), P(i,j), Q(i,j) }
|
365
|
+
where P(i,j) = Min { D(i-k, j) + w_k } for k = 1, .., i
|
366
|
+
and Q(i,j) = Min { D(i, j-k) + w_k } for k = 1, ..., j
|
367
|
+
|
368
|
+
i.e., three options are:
|
369
|
+
1. match/mismatch,
|
370
|
+
2. gap open/extension in sequence (a),
|
371
|
+
3. gap open/extension in sequence (b)
|
372
|
+
|
373
|
+
pscore = D(i-1, j-1) + d(a_i, b_j)
|
374
|
+
tmp_pp = P(i,j)
|
375
|
+
q = Q(i,j)
|
376
|
+
*/
|
377
|
+
|
378
|
+
//maybe just >?
|
379
|
+
if (tmp_pp >= pscore)
|
380
|
+
{
|
381
|
+
if (tmp_pp > q)
|
382
|
+
{
|
383
|
+
// gap open / extension in (a)
|
384
|
+
s = tmp_pp;
|
385
|
+
piSS[i][j] = i - 1;
|
386
|
+
pjSS[i][j] = j;
|
387
|
+
}
|
388
|
+
else // q > tmp_pp > pscore
|
389
|
+
{
|
390
|
+
// gap open / extension in (b)
|
391
|
+
s = q;
|
392
|
+
piSS[i][j] = i;
|
393
|
+
pjSS[i][j] = j - 1;
|
394
|
+
}
|
395
|
+
}
|
396
|
+
else // pscore > tmp_pp)
|
397
|
+
{
|
398
|
+
if (pscore > q)
|
399
|
+
{
|
400
|
+
// match / mismatch
|
401
|
+
s = pscore;
|
402
|
+
piSS[i][j] = i - 1;
|
403
|
+
pjSS[i][j] = j - 1;
|
404
|
+
}
|
405
|
+
else // q > pscore > tmp_pp
|
406
|
+
{
|
407
|
+
// gap open / extension in (b)
|
408
|
+
s = q;
|
409
|
+
piSS[i][j] = i;
|
410
|
+
pjSS[i][j] = j - 1;
|
411
|
+
}
|
412
|
+
}
|
413
|
+
|
414
|
+
SS[j] = s;
|
415
|
+
|
416
|
+
if (i == M && SS[j] >= maxiS)
|
417
|
+
{
|
418
|
+
maxiS = SS[j];
|
419
|
+
maxij = j;
|
420
|
+
}
|
421
|
+
}
|
422
|
+
|
423
|
+
if (SS[N] >= maxjS)
|
424
|
+
{
|
425
|
+
maxjS = SS[N];
|
426
|
+
maxji = i;
|
427
|
+
}
|
428
|
+
|
429
|
+
for (j = 0; j < N + 1; j++)
|
430
|
+
{
|
431
|
+
oldSS[j] = SS[j];
|
432
|
+
}
|
433
|
+
}
|
434
|
+
|
435
|
+
if (maxij>N)
|
436
|
+
maxij=N;
|
437
|
+
if (maxji>M)
|
438
|
+
maxji=M;
|
439
|
+
if (maxij<0)
|
440
|
+
maxij=0;
|
441
|
+
if (maxji<0)
|
442
|
+
maxji=0;
|
443
|
+
|
444
|
+
//add starting -'s
|
445
|
+
int alignment_score;
|
446
|
+
if (maxiS > maxjS)
|
447
|
+
{
|
448
|
+
alignment_score = maxiS;
|
449
|
+
i = M;
|
450
|
+
j = maxij;
|
451
|
+
for (int kk = N; kk > maxij; kk--)
|
452
|
+
{
|
453
|
+
*newseqb += (*seqb)[kk - 1];
|
454
|
+
*newseqa += '-';
|
455
|
+
}
|
456
|
+
}
|
457
|
+
else
|
458
|
+
{
|
459
|
+
alignment_score = maxjS;
|
460
|
+
i = maxji;
|
461
|
+
j = N;
|
462
|
+
for (int kk = M; kk > maxji; kk--)
|
463
|
+
{
|
464
|
+
*newseqa += (*seqa)[kk - 1];
|
465
|
+
*newseqb += '-';
|
466
|
+
}
|
467
|
+
}
|
468
|
+
|
469
|
+
bool decI = false;
|
470
|
+
bool decJ = false;
|
471
|
+
//inserting -'s in the middle!
|
472
|
+
while(i >= 1 && j >= 1)
|
473
|
+
{
|
474
|
+
decI=false;
|
475
|
+
decJ=false;
|
476
|
+
if (piSS[i][j] < i)
|
477
|
+
{
|
478
|
+
*newseqa += (*seqa)[i - 1];
|
479
|
+
decI = true;
|
480
|
+
}
|
481
|
+
else
|
482
|
+
{
|
483
|
+
*newseqa += '-';
|
484
|
+
}
|
485
|
+
|
486
|
+
if (pjSS[i][j] < j)
|
487
|
+
{
|
488
|
+
*newseqb += (*seqb)[j - 1];
|
489
|
+
decJ=true;
|
490
|
+
}
|
491
|
+
else
|
492
|
+
{
|
493
|
+
*newseqb += '-';
|
494
|
+
}
|
495
|
+
|
496
|
+
if (decI)
|
497
|
+
{
|
498
|
+
i--;
|
499
|
+
}
|
500
|
+
if (decJ)
|
501
|
+
{
|
502
|
+
j--;
|
503
|
+
}
|
504
|
+
}
|
505
|
+
|
506
|
+
//add extra trailing -'s
|
507
|
+
//forgive terminal gap penalties if user specifies this option
|
508
|
+
if (i < j)
|
509
|
+
{
|
510
|
+
for (int jj = j; jj >= 1; jj--)
|
511
|
+
{
|
512
|
+
*newseqb += (*seqb)[jj - 1];
|
513
|
+
*newseqa += '-';
|
514
|
+
if (use_terminal_gap_penalty==0) alignment_score += gep;
|
515
|
+
}
|
516
|
+
if (use_terminal_gap_penalty==0) alignment_score += gip;
|
517
|
+
}
|
518
|
+
else if(i > j)
|
519
|
+
{
|
520
|
+
for (int ii = i; ii >= 1; ii--)
|
521
|
+
{
|
522
|
+
*newseqa += (*seqa)[ii - 1];
|
523
|
+
*newseqb += '-';
|
524
|
+
if (use_terminal_gap_penalty==0) alignment_score += gep;
|
525
|
+
}
|
526
|
+
if (use_terminal_gap_penalty==0) alignment_score += gip;
|
527
|
+
}
|
528
|
+
|
529
|
+
reverse(newseqa);
|
530
|
+
reverse(newseqb);
|
531
|
+
|
532
|
+
for (i = 0; i < M + 1; i++)
|
533
|
+
{
|
534
|
+
delete []piSS[i];
|
535
|
+
delete []pjSS[i];
|
536
|
+
}
|
537
|
+
|
538
|
+
delete []SS;
|
539
|
+
delete []oldSS;
|
540
|
+
delete []piSS;
|
541
|
+
delete []pjSS;
|
542
|
+
delete []PP;
|
543
|
+
return alignment_score;
|
544
|
+
}
|
545
|
+
|
546
|
+
/*
    Remove pre-existing gap characters ('-') from a sequence prior to
    alignment. The remaining characters keep their relative order.

    Done as a single in-place compaction pass. Positions are kept in the
    string's own size type, so nothing depends on how string::npos converts
    to a narrower integer.
*/
void degap(string* seq)
{
    const string::size_type length = seq->size();
    string::size_type kept = 0;

    for (string::size_type k = 0; k < length; ++k)
    {
        const char c = (*seq)[k];
        if (c != '-')
        {
            (*seq)[kept++] = c;
        }
    }
    seq->resize(kept);
}
|
561
|
+
|
562
|
+
/*
    Remove leading and trailing whitespace (space, tab, newline, carriage
    return) from a sequence, in place. Interior whitespace is kept.

    An empty or all-whitespace string becomes the empty string; it is handled
    up front so that no character of an empty string is ever indexed.
*/
void trim(string* seq)
{
    static const char whitespace[] = " \t\n\r";

    const string::size_type first = seq->find_first_not_of(whitespace);
    if (first == string::npos)
    {
        seq->clear();
        return;
    }

    // cut the tail first so `first` stays valid for the head cut
    const string::size_type last = seq->find_last_not_of(whitespace);
    seq->erase(last + 1);
    seq->erase(0, first);
}
|
577
|
+
|
578
|
+
|
579
|
+
/*
    Cluster gaps that are separated only by a run of one repeated symbol.

    For every gap at position i, the neighbour on each side is examined. If,
    walking away from the gap, only copies of that neighbour are seen before
    another gap is reached, the neighbour is moved into that far gap and its
    old position next to i becomes a gap. The string is edited in place and
    its length never changes.

    All indices are signed and checked against the string bounds, so a gap at
    either end of the string, or a run that reaches the start of the string,
    simply ends the search.
*/
void widen_gaps(string* seq)
{
    const int size = seq->size();

    for (int i = 0; i < size; i++)
    {
        if ((*seq)[i] != '-')
        {
            continue;
        }

        // backwards: try to move the symbol at i - 1 further left
        if (i >= 1)
        {
            const char letter = (*seq)[i - 1];
            for (int j = i - 2; j >= 0; j--)
            {
                if ((*seq)[j] == '-')
                {
                    // swap this with i - 1
                    (*seq)[j] = letter;
                    (*seq)[i - 1] = '-';
                    break;
                }
                if ((*seq)[j] != letter)
                {
                    break;  // a different symbol interrupts the run
                }
            }
        }

        // forwards: try to move the symbol at i + 1 further right
        if (i + 1 < size)
        {
            const char letter = (*seq)[i + 1];
            for (int j = i + 2; j < size; j++)
            {
                if ((*seq)[j] == '-')
                {
                    // swap this with i + 1
                    (*seq)[j] = letter;
                    (*seq)[i + 1] = '-';
                    break;
                }
                if ((*seq)[j] != letter)
                {
                    break;  // a different symbol interrupts the run
                }
            }
        }
    }
}
|
638
|
+
|
639
|
+
#ifdef __PYTHON__
|
640
|
+
/* Python wrapper functions */
|
641
|
+
// Python: align_it(standard, seq, gap_init_penalty, gap_extend_penalty,
//                  use_terminal_gap_penalty) -> (aligned_standard, aligned_seq, score)
// Nucleotide alignment. Both inputs are whitespace-trimmed; existing gap
// characters are left in place.
static PyObject * align_it(PyObject * self, PyObject * args)
{
    const char * standard;
    const char * seq;
    int gap_init_penalty;
    int gap_extend_penalty;
    int use_terminal_gap_penalty;

    const int parsed = PyArg_ParseTuple(args, "ssiii", &standard, &seq,
                                        &gap_init_penalty, &gap_extend_penalty,
                                        &use_terminal_gap_penalty);
    if (!parsed) {
        return NULL;
    }

    init_pairscore(5, 4); // match, mismatch scores +5, -4 respectively (HyPhy defaults)

    string reference(standard);
    string query(seq);
    trim(&reference);
    trim(&query);

    string aligned_reference;
    string aligned_query;
    const int score = align(&reference, &query, &aligned_reference, &aligned_query,
                            gap_init_penalty, gap_extend_penalty, use_terminal_gap_penalty);

    // Py_BuildValue copies the character data, so the locals may go out of scope
    return Py_BuildValue("ssi", aligned_reference.c_str(), aligned_query.c_str(), score);
}
|
676
|
+
|
677
|
+
// Python: align_it_rb(standard, seq, gap_init_penalty, gap_extend_penalty)
//         -> (aligned_standard, aligned_seq)
// Emulates the Ruby implementation of align_it: match/mismatch scores of
// +1/-1, inputs trimmed and degapped, terminal-gap flag fixed at 0, and no
// score in the result.
static PyObject * align_it_rb(PyObject * self, PyObject * args)
{
    const char * standard;
    const char * seq;
    int gap_init_penalty;
    int gap_extend_penalty;

    const int parsed = PyArg_ParseTuple(args, "ssii", &standard, &seq,
                                        &gap_init_penalty, &gap_extend_penalty);
    if (!parsed) {
        return NULL;
    }

    init_pairscore(1, 1);

    string reference(standard);
    string query(seq);
    trim(&reference);
    trim(&query);
    degap(&reference);
    degap(&query);

    string aligned_reference;
    string aligned_query;
    align(&reference, &query, &aligned_reference, &aligned_query,
          gap_init_penalty, gap_extend_penalty, 0);

    // Py_BuildValue copies the character data, so the locals may go out of scope
    return Py_BuildValue("ss", aligned_reference.c_str(), aligned_query.c_str());
}
|
710
|
+
|
711
|
+
// Python: align_it_aa(standard, seq, gap_init_penalty, gap_extend_penalty,
//                     use_terminal_gap_penalty) -> (aligned_standard, aligned_seq, score)
// Protein alignment scored with the empirical HIV 25% matrix. Inputs are
// whitespace-trimmed only: HyPhy behaviour is to not remove gaps.
static PyObject * align_it_aa(PyObject * self, PyObject * args)
{
    const char * standard;
    const char * seq;
    int gap_init_penalty;
    int gap_extend_penalty;
    int use_terminal_gap_penalty;

    const int parsed = PyArg_ParseTuple(args, "ssiii", &standard, &seq,
                                        &gap_init_penalty, &gap_extend_penalty,
                                        &use_terminal_gap_penalty);
    if (!parsed) {
        return NULL;
    }

    init_pairscore_hiv25();

    string reference(standard);
    string query(seq);
    trim(&reference);
    trim(&query);

    string aligned_reference;
    string aligned_query;
    const int score = align(&reference, &query, &aligned_reference, &aligned_query,
                            gap_init_penalty, gap_extend_penalty, use_terminal_gap_penalty);

    // Py_BuildValue copies the character data, so the locals may go out of scope
    return Py_BuildValue("ssi", aligned_reference.c_str(), aligned_query.c_str(), score);
}
|
745
|
+
|
746
|
+
// Python: align_it_aa_rb(standard, seq, gap_init_penalty, gap_extend_penalty)
//         -> (aligned_standard, aligned_seq)
// Emulates the Ruby implementation of align_it_aa: flat amino-acid scores
// from init_pairscore_aa(4, -2), inputs trimmed and degapped, terminal-gap
// flag fixed at 0, and no score in the result.
static PyObject * align_it_aa_rb(PyObject * self, PyObject * args)
{
    const char * standard;
    const char * seq;
    int gap_init_penalty;
    int gap_extend_penalty;

    const int parsed = PyArg_ParseTuple(args, "ssii", &standard, &seq,
                                        &gap_init_penalty, &gap_extend_penalty);
    if (!parsed) {
        return NULL;
    }

    init_pairscore_aa(4, -2);

    string reference(standard);
    string query(seq);
    trim(&reference);
    trim(&query);
    degap(&reference);
    degap(&query);

    string aligned_reference;
    string aligned_query;
    align(&reference, &query, &aligned_reference, &aligned_query,
          gap_init_penalty, gap_extend_penalty, 0);

    // Py_BuildValue copies the character data, so the locals may go out of scope
    return Py_BuildValue("ss", aligned_reference.c_str(), aligned_query.c_str());
}
|
779
|
+
|
780
|
+
// Method table exported by the Python module: name, C entry point, calling
// convention, docstring.
static PyMethodDef AlignmentMethods [] =
{
    {
        "align_it", align_it, METH_VARARGS,
        "Pairwise alignment of nucleotide sequences."
    },
    {
        "align_it_rb", align_it_rb, METH_VARARGS,
        "Pairwise alignment of nucleotide sequences using ReCall settings."
    },
    {
        "align_it_aa", align_it_aa, METH_VARARGS,
        "Pairwise alignment of protein sequences using empirical HIV 25% score matrix."
    },
    {
        "align_it_aa_rb", align_it_aa_rb, METH_VARARGS,
        "Pairwise alignment of protein sequences using ReCall settings."
    },
    { NULL, NULL, 0, NULL }  // sentinel terminating the table
};
|
788
|
+
|
789
|
+
// Definition of the Python 3 extension module "gotoh".
static struct PyModuleDef AlignmentModuleDef = {
    PyModuleDef_HEAD_INIT,  // m_base
    "gotoh",                // m_name
    NULL,                   // m_doc: no module docstring
    -1,                     // m_size
    AlignmentMethods,       // m_methods
    NULL,                   // m_slots
    NULL,                   // m_traverse
    NULL,                   // m_clear
    NULL                    // m_free
};
|
800
|
+
|
801
|
+
// Module initialisation entry point: creates the module object described by
// AlignmentModuleDef and returns it (NULL on failure).
PyMODINIT_FUNC PyInit_gotoh(void)
{
    PyObject * module = PyModule_Create(&AlignmentModuleDef);
    return module;
}
|
804
|
+
|
805
|
+
#else
|
806
|
+
/* Ruby wrapper functions */
|
807
|
+
/*
    CfeGotoh.align_it(standard, seq, gap_init_penalty, gap_extend_penalty)
      -> [aligned_standard, aligned_seq]

    Nucleotide alignment with match/mismatch scores of +1/-1. Both inputs are
    whitespace-trimmed and degapped before alignment.

    Raises TypeError when standard or seq cannot be converted to a String or
    a penalty cannot be converted to an Integer, and ArgumentError when a
    sequence contains a NUL byte.

    Ruby reports conversion errors with longjmp, which skips C++ destructors.
    All conversions that can raise are therefore done first, before any C++
    object with a destructor exists.
*/
extern "C" VALUE align_it(VALUE self, VALUE standard, VALUE seq, VALUE gap_init_penalty, VALUE gap_extend_penalty)
{
    const int gip = NUM2INT(gap_init_penalty);
    const int gep = NUM2INT(gap_extend_penalty);
    const char *standard_cstr = StringValueCStr(standard);
    const char *seq_cstr = StringValueCStr(seq);

    init_pairscore(1, 1);

    string reference(standard_cstr);
    string query(seq_cstr);
    trim(&reference);
    trim(&query);
    degap(&reference);
    degap(&query);

    string aligned_reference;
    string aligned_query;
    align(&reference, &query, &aligned_reference, &aligned_query, gip, gep, 0);

    // rb_str_new2 copies the character data into new Ruby strings
    return rb_ary_new3(2, rb_str_new2(aligned_reference.c_str()), rb_str_new2(aligned_query.c_str()));
}
|
830
|
+
|
831
|
+
/*
    CfeGotoh.align_it_aa(standard, seq, gap_init_penalty, gap_extend_penalty)
      -> [aligned_standard, aligned_seq]

    Amino-acid alignment using the flat scores from init_pairscore_aa(4, -2).
    Both inputs are whitespace-trimmed and degapped before alignment.

    Raises TypeError when standard or seq cannot be converted to a String or
    a penalty cannot be converted to an Integer, and ArgumentError when a
    sequence contains a NUL byte.

    Ruby reports conversion errors with longjmp, which skips C++ destructors.
    All conversions that can raise are therefore done first, before any C++
    object with a destructor exists.
*/
extern "C" VALUE align_it_aa(VALUE self, VALUE standard, VALUE seq, VALUE gap_init_penalty, VALUE gap_extend_penalty)
{
    const int gip = NUM2INT(gap_init_penalty);
    const int gep = NUM2INT(gap_extend_penalty);
    const char *standard_cstr = StringValueCStr(standard);
    const char *seq_cstr = StringValueCStr(seq);

    init_pairscore_aa(4, -2);

    string reference(standard_cstr);
    string query(seq_cstr);
    trim(&reference);
    trim(&query);
    degap(&reference);
    degap(&query);

    string aligned_reference;
    string aligned_query;
    align(&reference, &query, &aligned_reference, &aligned_query, gip, gep, 0);

    // rb_str_new2 copies the character data into new Ruby strings
    return rb_ary_new3(2, rb_str_new2(aligned_reference.c_str()), rb_str_new2(aligned_query.c_str()));
}
|
854
|
+
|
855
|
+
extern "C" void Init_cfe_gotoh()
|
856
|
+
{
|
857
|
+
VALUE gotoh = rb_define_module("CfeGotoh");
|
858
|
+
rb_define_module_function(gotoh, "align_it", (VALUE(*)(...))align_it, 4);
|
859
|
+
rb_define_module_function(gotoh, "align_it_aa", (VALUE(*)(...))align_it_aa, 4);
|
860
|
+
}
|
861
|
+
|
862
|
+
#endif
|
data/lib/cfe_gotoh.rb
ADDED
@@ -0,0 +1,413 @@
|
|
1
|
+
#TODO: Scoring algorithm to improve frame_align?
|
2
|
+
|
3
|
+
require_relative 'cfe_gotoh/cfe_gotoh'
|
4
|
+
|
5
|
+
|
6
|
+
module CfeGotoh
|
7
|
+
class Error < RuntimeError
|
8
|
+
end
|
9
|
+
|
10
|
+
class GapMergeError < Error
|
11
|
+
end
|
12
|
+
|
13
|
+
# Nucleotide substitution scores: a 127x127 table indexed by the character
# codes of the two symbols being compared. Any pair not set below scores -1.0.
sub_matrix = Array.new(127) { Array.new(127) { -1.0 } }

# Writes the same score in both directions.
set_symmetric = lambda do |first, second, score|
  sub_matrix[first.ord][second.ord] = score
  sub_matrix[second.ord][first.ord] = score
end

# Every listed code matches itself; every one except N is heavily
# penalised against X.
'ATGCRYKMBDHVSWN'.each_char do |nuc|
  sub_matrix[nuc.ord][nuc.ord] = 1.0
  set_symmetric.call(nuc, 'X', -6.0) unless nuc == 'N'
end

# Mixture codes score as a match against each base they are paired with here.
mixture_members = {
  # bi-mixtures
  'R' => 'AG', 'Y' => 'CT', 'K' => 'GT',
  'M' => 'CA', 'S' => 'CG', 'W' => 'TA',
  # tri-mixtures
  'B' => 'CGT', 'D' => 'AGT', 'H' => 'ACT', 'V' => 'ACG'
}
mixture_members.each do |mixture, bases|
  bases.each_char { |base| set_symmetric.call(base, mixture, 1.0) }
end

# other
sub_matrix['$'.ord]['$'.ord] = 50.0
set_symmetric.call('T', 'U', 1.0)
sub_matrix['N'.ord]['N'.ord] = 0.0 # overrides the 1.0 self-match set above
set_symmetric.call('X', '-', 3.0)

# Scores of the four plain bases against the special symbols.
special_scores = { '*' => 1.0, '&' => 0.7, '$' => 0.0, '.' => -20.0, 'N' => -3.0 }
'ATGC'.each_char do |base|
  special_scores.each { |symbol, score| set_symmetric.call(base, symbol, score) }
end

# Freeze every row and the table itself so the constant cannot be modified.
sub_matrix.each(&:freeze)
sub_matrix.freeze

NUCLEOTIDE_MATRIX = sub_matrix
|
60
|
+
|
61
|
+
# Scores an aligned pair by adding up the NUCLEOTIDE_MATRIX entry for every
# column. One column is scored per character of standard; both symbols are
# upcased before the lookup. Returns a Float (0.0 for an empty standard).
def self.score_alignment(standard, query)
  (0...standard.size).inject(0.0) do |total, column|
    row = NUCLEOTIDE_MATRIX[standard[column, 1].upcase.ord]
    total + row[query[column, 1].upcase.ord]
  end
end
|
68
|
+
|
69
|
+
# Lists the gaps in seq as arrays of consecutive dash positions, from left to
# right; e.g. "A--T-" gives [[1, 2], [4]]. Returns [] when seq has no dashes.
def self.make_gap_list(seq)
  dash_positions = (0...seq.size).select { |pos| seq[pos, 1] == '-' }
  # Start a new run wherever two dash positions are not adjacent.
  dash_positions.slice_when { |left, right| right != left + 1 }.to_a
end
|
88
|
+
|
89
|
+
# Removes the run of dashes at the very start of standard, together with the
# same number of leading characters of query. Both strings are edited in
# place. Nothing happens when standard does not begin with a dash or consists
# only of dashes. The return value is not meaningful.
def self.trim_leading_dashes(standard, query)
  # \A anchors at the start of the string; ^ would also match after an
  # embedded newline and make us delete the wrong characters.
  leading_dashes_match = /\A(-+)[^-]/.match(standard)
  return if leading_dashes_match.nil?

  dash_count = leading_dashes_match[1].size
  standard[0, dash_count] = ''
  query[0, dash_count] = ''
end
|
98
|
+
|
99
|
+
# Removes the run of dashes at the very end of standard, together with the
# characters of query at the same positions. Both strings are edited in
# place. Nothing happens when standard does not end with a dash or consists
# only of dashes. The return value is not meaningful.
def self.trim_trailing_dashes(standard, query)
  # \z anchors at the end of the string; $ would also match before an
  # embedded newline and make us delete the wrong characters.
  trailing_dashes_match = /[^-](-+)\z/.match(standard)
  return if trailing_dashes_match.nil?

  dash_count = trailing_dashes_match[1].size
  end_of_standard = standard.size - dash_count
  standard[end_of_standard, dash_count] = ''
  query[end_of_standard, dash_count] = ''
end
|
109
|
+
|
110
|
+
# Pads the run of dashes at one edge of query to a whole number of codons by
# overwriting the adjacent query characters with dashes. query is edited in
# place; its length never changes.
#
# side: :leading fixes the start of query; any other value fixes the end.
#
# A run whose length is 1 modulo 3 swallows the two neighbouring characters;
# a run whose length is 2 modulo 3 swallows one. Nothing happens when the
# chosen edge has no dashes or when query consists only of dashes.
# Always returns nil.
def self.fix_incomplete_edge_codon(query, side=:leading)
  leading = (side == :leading)

  # \A and \z pin the match to the real edge of the string; the match also
  # fails for an all-dash query, which is then left alone.
  dash_regex = leading ? /\A(-+)[^-]/ : /[^-](-+)\z/
  edge_match = dash_regex.match(query)
  return nil if edge_match.nil?

  dash_count = edge_match[1].size
  remainder = dash_count % 3
  return nil if remainder == 0

  if leading
    first_non_dash_idx = dash_count
    incr = 1
  else
    first_non_dash_idx = query.size - dash_count - 1
    incr = -1
  end

  positions = [first_non_dash_idx]
  positions << first_non_dash_idx + incr if remainder == 1

  positions.each do |idx|
    # Only overwrite real characters: an index equal to the length would
    # append to query, and a negative index would wrap to the other end.
    query[idx] = '-' if idx >= 0 && idx < query.size
  end
  nil
end
|
138
|
+
|
139
|
+
# Cancels insertions against nearby deletions until the aligned sequences
# have a length divisible by 3. This helps fix poor insertions near the start
# of the sequence. Both strings are edited in place; returns nil.
#
# For each dash in standard (an inserted base), looks for a dash in query
# (a deleted base) one position away and then two positions away. When one is
# found both dashes are removed, shortening each string by one, and the scan
# restarts from the beginning.
#
# Raises RuntimeError when standard and query differ in length.
def self.merge_insertions_and_deletions_to_fix_oof_sequences(
  standard,
  query
)
  raise 'Standard and query should be the same length' if standard.size() != query.size()
  return if standard.size() % 3 == 0

  dex = 0
  while (dex = standard.index('-', dex))
    # Look one base away, then two bases away.
    partner = [-1, 1, -2, 2].map { |offset| dex + offset }.find do |pos|
      pos >= 0 && query[pos] == '-'
    end

    if partner
      standard[dex] = ''
      query[partner] = ''

      # Stop if the sequences are now a cogent length.
      break if standard.size() % 3 == 0

      # Restart at index 0 (not 1): after the removal a dash may again sit at
      # the very first position and still have a partner to merge with.
      dex = 0
    else
      dex += 1
    end
  end
end
|
167
|
+
|
168
|
+
# Merges neighbouring gaps whose individual sizes are not a multiple of 3
# into codon-sized gaps and returns the new gap list.
#
# gaps is a left-to-right list of gaps, each an array of consecutive
# positions. Entries of gaps that get absorbed into a merge are overwritten
# with [] in the caller's list.
#
# A gap that cannot be merged is either kept as it is or, when raise_errors
# is true, reported by raising GapMergeError.
def self.cluster_gaps(gaps, raise_errors=false)
  clustered = []

  gaps.each_with_index do |current, idx|
    # An empty entry was swallowed by an earlier merge.
    next if current.empty?

    # Already codon-sized: keep as is.
    if (current.size % 3).zero?
      clustered << current
      next
    end

    following = gaps[idx + 1] # nil when we are at the end of the list
    third = gaps[idx + 2]

    if following and
       ((current.size + following.size) % 3).zero? and
       following.first - current.last < 9
      # Merge with the next gap, anchoring the result on the larger of the two.
      merged =
        if following.size > current.size
          Array.new(current.size) { |k| following.first - current.size + k } + following
        else
          current + Array.new(following.size) { |k| current.last + 1 + k }
        end
      clustered << merged
      gaps[idx + 1] = []
    elsif following and third and
          ((current.size + following.size + third.size) % 3).zero? and
          third.first - current.last < 12
      # Merge with the next two gaps, building the result around the middle one.
      left_part = Array.new(current.size) { |k| following.first - current.size + k }
      right_part = Array.new(third.size) { |k| following.last + 1 + k }
      clustered << left_part + following + right_part
      gaps[idx + 1] = []
      gaps[idx + 2] = []
    elsif raise_errors
      # No merge is possible and the caller asked to be told.
      raise GapMergeError
    else
      clustered << current # FIXME this behaviour differs between insertions and deletions
    end
  end

  clustered
end
|
215
|
+
|
216
|
+
# Shifts each gap onto a codon boundary, editing the gap arrays in place, and
# returns gaps. When common_gap_locations is given, a gap lying close to the
# nearest of those locations is first moved onto it.
# Gaps must be listed in ascending order, i.e. from left to right.
def self.align_gaps_to_frame(gaps, common_gap_locations=nil)
  shift = 0 # total size of the gaps handled so far

  gaps.each do |gap|
    unless common_gap_locations.nil?
      # Distance between a common location (scaled by 3) and where this gap
      # starts once the earlier gaps are discounted.
      distance = lambda { |location| (3 * location - (gap[0] - shift)).abs }
      nearest = common_gap_locations.min { |a, b| distance.call(a) <=> distance.call(b) }

      if !nearest.nil? and distance.call(nearest) <= 9
        # Re-seat the gap at the common location.
        gap.replace(Array.new(gap.size) { |k| 3 * nearest + k + shift })
      end
    end

    # Snap to the nearest frame boundary.
    # Original comment from Conan: scoring would be good here
    case gap[0] % 3
    when 1 then gap.map! { |pos| pos - 1 } # set back one base
    when 2 then gap.map! { |pos| pos + 1 } # set forward one base
    end

    shift += gap.size
  end

  gaps
end
|
258
|
+
|
259
|
+
# Strips every dash from seq and inserts a dash at each listed gap position,
# returning the result as a new string. Positions refer to the *aligned*
# sequence, so they already include the shift caused by gaps placed earlier;
# the gaps must therefore be in left-to-right order. A position beyond the
# current end of the string appends the dash.
def self.splice_gaps_into_sequence(seq, gaps)
  spliced = seq.delete('-')

  gaps.each do |gap|
    gap.each do |pos|
      target = pos > spliced.size ? -1 : pos
      spliced.insert(target, '-')
    end
  end

  spliced
end
|
276
|
+
|
277
|
+
#common_insert_locations is based on amino acid locations starting at base 0.
|
278
|
+
#Assumes standard in the first base.
|
279
|
+
#Prealigned lets you run a lot of the corrections and QC on an already aligned sequence.
|
280
|
+
# Aligns query against standard and then adjusts the gaps so that insertions
# and deletions fall on codon boundaries.
#
# standard, query         - the sequences; already aligned (equal length) when
#                           prealigned is true, otherwise passed to align_it.
# gap_init, gap_penalty   - forwarded to align_it.
# common_insert_locations - positions (scaled by 3 by align_gaps_to_frame)
#                           that nearby insertions are moved onto; nil means
#                           none.
# trim                    - when true, drop the columns where standard has
#                           leading/trailing dashes and pad the edge dashes of
#                           query to whole codons.
# raise_errors            - when true, raise instead of returning an alignment
#                           whose gaps could not be made codon-sized.
# prealigned              - when true, skip align_it and only run the
#                           corrections.
#
# Returns [standard, query] as adjusted strings. The strings passed in by the
# caller are never modified.
#
# Raises RuntimeError when the aligned sequences differ in length, or (with
# raise_errors) when the inserted/deleted bases are not divisible by 3 or a
# gap cannot be merged.
def self.frame_align(
  standard,
  query,
  gap_init=3,
  gap_penalty=1,
  common_insert_locations=nil,
  trim=false,
  raise_errors=false,
  prealigned=false
)
  common_insert_locations = [] if common_insert_locations.nil?

  if prealigned
    # The trimming and merging helpers edit their arguments in place, so work
    # on copies rather than on the caller's strings.
    standard = standard.dup
    query = query.dup
  else
    elem = align_it(standard, query, gap_init, gap_penalty)
    standard = elem[0]
    query = elem[1]
  end
  raise "Standard and query should be the same length" if standard.size() != query.size()

  # Trim leading and trailing dashes if desired.
  if trim
    trim_leading_dashes(standard, query)
    trim_trailing_dashes(standard, query)
    fix_incomplete_edge_codon(query, :leading)
    fix_incomplete_edge_codon(query, :trailing)
  end

  merge_insertions_and_deletions_to_fix_oof_sequences(standard, query)

  if raise_errors
    # Dashes in the standard are inserted bases; dashes in the query are
    # deleted bases.
    inserted = standard.count('-')
    if inserted % 3 != 0
      raise "Cannot frame align, #{inserted} inserted bases not divisible by 3"
    end
    deleted = query.count('-')
    if deleted % 3 != 0
      raise "Cannot frame align, #{deleted} deleted bases not divisible by 3"
    end
  end

  # Build the insert/delete lists. These lists look like
  # [[3,4,5], [9], [11,12]]
  insert_list = make_gap_list(standard)
  delete_list = make_gap_list(query)

  # Process the insertions.
  unless insert_list.empty?
    # Step 1: cluster the insertions.
    begin
      new_ins_list = cluster_gaps(insert_list, raise_errors)
    rescue GapMergeError
      raise "Cannot frame align insert"
    end

    # Step 2: frame-align the insertions, shifting things to common insertion
    # positions where appropriate.
    align_gaps_to_frame(new_ins_list, common_insert_locations)

    # Put the insertions back into the standard.
    standard = splice_gaps_into_sequence(standard, new_ins_list)
  end

  # Process the deletions.
  unless delete_list.empty?
    # As above, step 1 is to cluster the deletions.
    # FIXME note that the original code behaved differently between
    # insertions and deletions; confirm that this is the right
    # way forward.
    begin
      new_del_list = cluster_gaps(delete_list, raise_errors)
    rescue GapMergeError
      raise "Cannot frame align deletion"
    end

    # Again as above, frame-align the deletions; this time
    # we don't worry about any common deletion positions.
    align_gaps_to_frame(new_del_list)

    # Put the deletions back into the query.
    query = splice_gaps_into_sequence(query, new_del_list)
  end

  return [standard, query]
end
|
365
|
+
|
366
|
+
#Returns a [seq_sans_inserts, [list of inserts]]
|
367
|
+
# Convenience wrapper around remove_insertions_from_query for an alignment
# given as a [standard, query] pair.
def self.remove_inserts(elem)
  standard, query = elem[0], elem[1]
  remove_insertions_from_query(standard, query)
end
|
370
|
+
|
371
|
+
# Removes from query every base aligned to a dash in standard (an insertion)
# and reports what was removed.
#
# Returns [seq, inserts]:
#   seq     - a new string: query without the inserted bases. All other
#             characters are kept unchanged, including any '.' symbols.
#   inserts - one [location, bases] pair per run of consecutive insert
#             positions, left to right. location is the run's start position,
#             less the number of inserted bases before it, divided by 3
#             (integer division).
#
# Neither argument is modified.
def self.remove_insertions_from_query(standard, query)
  insert_positions = (0...standard.size).select { |i| standard[i, 1] == '-' }

  # Group the insert positions into runs of consecutive positions.
  insert_runs = insert_positions.slice_when { |left, right| right != left + 1 }

  inserts = []
  offset = 0 # inserted bases seen before the current run
  insert_runs.each do |run|
    ins_seq = run.map { |i| query[i, 1] }.join
    inserts << [(run[0] - offset) / 3, ins_seq]
    offset += run.size
  end

  # Drop the inserted bases by position rather than by overwriting them with
  # a placeholder character, which would also delete that character wherever
  # the query genuinely contains it.
  is_insert = {}
  insert_positions.each { |i| is_insert[i] = true }
  kept = []
  query.each_char.with_index { |ch, i| kept << ch unless is_insert[i] }

  return [kept.join, inserts]
end
|
413
|
+
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cfe_gotoh
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.0.pre
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Conan Woods
|
8
|
+
- Jamie Kai
|
9
|
+
- David Rickett
|
10
|
+
- Richard Liang
|
11
|
+
autorequire:
|
12
|
+
bindir: bin
|
13
|
+
cert_chain: []
|
14
|
+
date: 2024-11-22 00:00:00.000000000 Z
|
15
|
+
dependencies: []
|
16
|
+
description:
|
17
|
+
email:
|
18
|
+
executables: []
|
19
|
+
extensions:
|
20
|
+
- ext/cfe_gotoh/extconf.rb
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- ext/cfe_gotoh/cfe_gotoh.cpp
|
24
|
+
- ext/cfe_gotoh/extconf.rb
|
25
|
+
- lib/cfe_gotoh.rb
|
26
|
+
homepage:
|
27
|
+
licenses: []
|
28
|
+
metadata:
|
29
|
+
github_repo: ssh://github.com/cfe-lab/gotoh
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">"
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.3.1
|
44
|
+
requirements: []
|
45
|
+
rubygems_version: 3.0.9
|
46
|
+
signing_key:
|
47
|
+
specification_version: 4
|
48
|
+
summary: CfE implementation of the Gotoh sequence alignment algorithm
|
49
|
+
test_files: []
|