cfe_gotoh 0.4.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/cfe_gotoh/cfe_gotoh.cpp +862 -0
- data/ext/cfe_gotoh/extconf.rb +4 -0
- data/lib/cfe_gotoh.rb +413 -0
- metadata +49 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 69fd99e925d82fb14d1035e8461fd32d6d83895ae9bf012efea277596772e7ce
|
4
|
+
data.tar.gz: 92b0780c678fa34b4be4d2148df83ed1f60c9d63c15a5742f3c17fa1cb4beb24
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ee778affdf1b42aca92b9acbedf6368df7b906a0b5ab6793132b126f04b781baa2a88a04254b0aa9c51a2f2dd3b09b9b0713edbba9ba85db86e26b6eaa0a5745
|
7
|
+
data.tar.gz: 01f949ecf25c278706fe4718b59e9ac3aa6e643d8ab3063a5fa1aba390d7f2fd3b28e7dbe1703fce94a84affd7b6573e1bbac29f117bb3442d7a6ef199fd8cbe
|
@@ -0,0 +1,862 @@
|
|
1
|
+
#include <string>
|
2
|
+
|
3
|
+
#ifdef __PYTHON__
|
4
|
+
#include <Python.h>
|
5
|
+
#else
|
6
|
+
#include "ruby.h"
|
7
|
+
|
8
|
+
#ifndef RSTRING_PTR
|
9
|
+
// Ruby 1.8.5 doesn't include this definition
|
10
|
+
#define RSTRING_PTR(s) (RSTRING(s)->ptr)
|
11
|
+
#endif
|
12
|
+
#endif
|
13
|
+
|
14
|
+
using namespace std;
|
15
|
+
|
16
|
+
/*
|
17
|
+
I think this application should do a complete alignment. Unfortunately aligning is way too slow in perl,
|
18
|
+
so I suspect the alignment, merging and possibly the gap widening should be done in c. Another possibility
|
19
|
+
is to call the C functions from perl, which would simplify things quite a bit! Unfortunately, I'm not entirely
|
20
|
+
confident that Perl can do this seamlessly (unlike nicer languages such as Ruby and Python).
|
21
|
+
*/
|
22
|
+
|
23
|
+
void trim(string* seq);
|
24
|
+
|
25
|
+
// Pairwise score table indexed by character code. It holds 127 x 127 entries,
// i.e. codes 0..126; code 127 (DEL) and anything above it are NOT covered.
static int nucMat[127][127];

/*
  Fill nucMat with nucleotide scores.

  matchscore      - score stored for identical characters and for every pair
                    listed below as compatible (case variants, T/U, IUPAC
                    mixture codes, '*' wild card).
  mismatchPenalty - positive penalty; every other pair is stored as
                    -mismatchPenalty.

  A number of pairs get fixed scores that do not depend on the arguments:
  N/N = 0, '$'/'$' = 50, '.' vs base = -20, N vs base = -3, X vs base or
  mixture = -6, X vs '-' = 3.  Later assignments override earlier ones, so the
  statement order below matters.
*/
void init_pairscore(int matchscore, int mismatchPenalty)
{
    // baseline: identity scores matchscore, everything else is a mismatch
    for (int i=0; i<127; i++)
    {
        for (int j=0; j<127; j++)
        {
            if (i==j)
            {
                nucMat[i][j]=matchscore;
            }
            else
            {
                nucMat[i][j]=-mismatchPenalty;
            }
        }
    }

    // adjust naive assignments for case-insensitivity
    nucMat['a']['A']=nucMat['A']['a']=matchscore;
    nucMat['c']['C']=nucMat['C']['c']=matchscore;
    nucMat['g']['G']=nucMat['G']['g']=matchscore;
    nucMat['t']['T']=nucMat['T']['t']=nucMat['u']['U']=nucMat['U']['u']=matchscore;
    // T and U are interchangeable in either case and either order
    nucMat['t']['u']=nucMat['t']['U']=nucMat['T']['u']=nucMat['T']['U']=matchscore;
    // fixed: this line used to repeat ['t']['T'] and left ['u']['T'] as a mismatch
    nucMat['u']['t']=nucMat['u']['T']=nucMat['U']['t']=nucMat['U']['T']=matchscore;
    nucMat['N']['N']=nucMat['n']['N']=nucMat['N']['n']=0;

    //bi-mixtures
    nucMat['A']['R']=nucMat['R']['A']=matchscore;
    nucMat['G']['R']=nucMat['R']['G']=matchscore;

    nucMat['C']['Y']=nucMat['Y']['C']=matchscore;
    nucMat['T']['Y']=nucMat['Y']['T']=matchscore;

    nucMat['G']['K']=nucMat['K']['G']=matchscore;
    nucMat['T']['K']=nucMat['K']['T']=matchscore;

    nucMat['C']['M']=nucMat['M']['C']=matchscore;
    nucMat['A']['M']=nucMat['M']['A']=matchscore;

    nucMat['C']['S']=nucMat['S']['C']=matchscore;
    nucMat['G']['S']=nucMat['S']['G']=matchscore;

    nucMat['T']['W']=nucMat['W']['T']=matchscore;
    nucMat['A']['W']=nucMat['W']['A']=matchscore;

    //tri-mixtures
    nucMat['C']['B']=nucMat['B']['C']=matchscore;
    nucMat['G']['B']=nucMat['B']['G']=matchscore;
    nucMat['T']['B']=nucMat['B']['T']=matchscore;

    nucMat['A']['D']=nucMat['D']['A']=matchscore;
    nucMat['G']['D']=nucMat['D']['G']=matchscore;
    nucMat['T']['D']=nucMat['D']['T']=matchscore;

    nucMat['A']['H']=nucMat['H']['A']=matchscore;
    nucMat['C']['H']=nucMat['H']['C']=matchscore;
    nucMat['T']['H']=nucMat['H']['T']=matchscore;

    nucMat['A']['V']=nucMat['V']['A']=matchscore;
    nucMat['C']['V']=nucMat['V']['C']=matchscore;
    nucMat['G']['V']=nucMat['V']['G']=matchscore;

    //Wild cards
    nucMat['*']['A']=nucMat['*']['a']=nucMat['A']['*']=nucMat['a']['*']=matchscore;
    nucMat['*']['C']=nucMat['*']['c']=nucMat['C']['*']=nucMat['c']['*']=matchscore;
    nucMat['*']['T']=nucMat['*']['t']=nucMat['T']['*']=nucMat['t']['*']=matchscore;
    nucMat['*']['G']=nucMat['*']['g']=nucMat['G']['*']=nucMat['g']['*']=matchscore;

    // '$' only gets a special score against itself; '$' against a base keeps
    // the default -mismatchPenalty set by the loop above.
    nucMat['$']['$']=50;

    //For those annoying duplicate phred values.
    nucMat['.']['A']=nucMat['.']['a']=nucMat['A']['.']=nucMat['a']['.']=-20;
    nucMat['.']['C']=nucMat['.']['c']=nucMat['C']['.']=nucMat['c']['.']=-20;
    nucMat['.']['T']=nucMat['.']['t']=nucMat['T']['.']=nucMat['t']['.']=-20;
    nucMat['.']['G']=nucMat['.']['g']=nucMat['G']['.']=nucMat['g']['.']=-20;

    nucMat['N']['A']=nucMat['N']['a']=nucMat['A']['N']=nucMat['a']['N']=-3;
    nucMat['N']['C']=nucMat['N']['c']=nucMat['C']['N']=nucMat['c']['N']=-3;
    nucMat['N']['T']=nucMat['N']['t']=nucMat['T']['N']=nucMat['t']['N']=-3;
    nucMat['N']['G']=nucMat['N']['g']=nucMat['G']['N']=nucMat['g']['N']=-3;

    //for easy alignment to a standard with gaps
    nucMat['X']['A']=nucMat['X']['a']=nucMat['A']['X']=nucMat['a']['X']=-6;
    nucMat['X']['C']=nucMat['X']['c']=nucMat['C']['X']=nucMat['c']['X']=-6;
    nucMat['X']['T']=nucMat['X']['t']=nucMat['T']['X']=nucMat['t']['X']=-6;
    nucMat['X']['G']=nucMat['X']['g']=nucMat['G']['X']=nucMat['g']['X']=-6;
    nucMat['X']['R']=nucMat['X']['r']=nucMat['R']['X']=nucMat['r']['X']=-6;
    nucMat['X']['Y']=nucMat['X']['y']=nucMat['Y']['X']=nucMat['y']['X']=-6;
    nucMat['X']['K']=nucMat['X']['k']=nucMat['K']['X']=nucMat['k']['X']=-6;
    nucMat['X']['M']=nucMat['X']['m']=nucMat['M']['X']=nucMat['m']['X']=-6;
    nucMat['X']['S']=nucMat['X']['s']=nucMat['S']['X']=nucMat['s']['X']=-6;
    nucMat['X']['W']=nucMat['X']['w']=nucMat['W']['X']=nucMat['w']['X']=-6;
    nucMat['X']['B']=nucMat['X']['b']=nucMat['B']['X']=nucMat['b']['X']=-6;
    nucMat['X']['D']=nucMat['X']['d']=nucMat['D']['X']=nucMat['d']['X']=-6;
    nucMat['X']['H']=nucMat['X']['h']=nucMat['H']['X']=nucMat['h']['X']=-6;
    nucMat['X']['V']=nucMat['X']['v']=nucMat['V']['X']=nucMat['v']['X']=-6;
    // fixed: both sides used to be ['X']['-'], so ['-']['X'] stayed a mismatch
    nucMat['X']['-']=nucMat['-']['X']=3;
}
|
132
|
+
|
133
|
+
|
134
|
+
void init_pairscore_aa(int matchscore, int mismatchPenalty)
|
135
|
+
{
|
136
|
+
for (int i=0; i<127; i++)
|
137
|
+
{
|
138
|
+
for (int j=0; j<127; j++)
|
139
|
+
{
|
140
|
+
if(i==j)
|
141
|
+
{
|
142
|
+
nucMat[i][j]=matchscore;
|
143
|
+
}
|
144
|
+
else
|
145
|
+
{
|
146
|
+
nucMat[i][j]=-mismatchPenalty;
|
147
|
+
if((char)i=='X' || (char)j=='X')
|
148
|
+
{
|
149
|
+
nucMat[i][j]=-4;
|
150
|
+
}
|
151
|
+
}
|
152
|
+
}
|
153
|
+
}
|
154
|
+
|
155
|
+
nucMat['Z']['Z']=nucMat['z']['Z']=nucMat['Z']['z']=0;
|
156
|
+
nucMat['X']['-']=nucMat['-']['X']=matchscore;
|
157
|
+
}
|
158
|
+
|
159
|
+
|
160
|
+
/*
|
161
|
+
Empirical score matrix based on 25% divergent HIV sequences
|
162
|
+
See Nickle, David C., et al. "HIV-specific probabilistic models of protein evolution."
|
163
|
+
PLoS One 2.6 (2007): e503.
|
164
|
+
*/
|
165
|
+
static int empirical_hiv25[24][24] = {\
|
166
|
+
{7,-7,-7,-4,-10,-11,-4,-3,-10,-6,-9,-9,-7,-13,-3,-2,1,-16,-15,0,-5,-5,-3,-17},\
|
167
|
+
{-7,7,-5,-11,-8,-2,-7,-2,0,-6,-6,2,-3,-12,-4,-2,-2,-5,-9,-10,-7,-3,-3,-17},\
|
168
|
+
{-7,-5,8,2,-9,-6,-6,-7,0,-6,-12,0,-10,-12,-9,1,0,-17,-3,-10,6,-6,-3,-17},\
|
169
|
+
{-4,-11,2,8,-14,-10,0,-2,-3,-11,-15,-7,-13,-15,-13,-5,-6,-16,-6,-5,7,0,-3,-17},\
|
170
|
+
{-10,-8,-9,-14,11,-16,-15,-5,-7,-11,-9,-13,-14,0,-12,-1,-6,-2,0,-8,-10,-16,-5,-17},\
|
171
|
+
{-11,-2,-6,-10,-16,8,-2,-10,0,-12,-4,0,-8,-12,-1,-9,-8,-14,-9,-13,-7,6,-4,-17},\
|
172
|
+
{-4,-7,-6,0,-15,-2,7,-1,-9,-12,-15,-1,-10,-17,-13,-11,-8,-15,-12,-5,0,6,-4,-17},\
|
173
|
+
{-3,-2,-7,-2,-5,-10,-1,7,-10,-11,-14,-6,-12,-9,-11,-1,-7,-5,-14,-5,-4,-3,-4,-17},\
|
174
|
+
{-10,0,0,-3,-7,0,-9,-10,10,-10,-4,-5,-10,-6,-3,-6,-6,-11,2,-14,-1,-2,-3,-17},\
|
175
|
+
{-6,-6,-6,-11,-11,-12,-12,-11,-10,7,0,-7,0,-2,-10,-4,0,-14,-9,2,-7,-12,-2,-17},\
|
176
|
+
{-9,-6,-12,-15,-9,-4,-15,-14,-4,0,6,-10,0,0,-3,-5,-8,-6,-8,-4,-13,-6,-4,-17},\
|
177
|
+
{-9,2,0,-7,-13,0,-1,-6,-5,-7,-10,7,-4,-14,-9,-5,-1,-12,-13,-9,-1,-1,-2,-17},\
|
178
|
+
{-7,-3,-10,-13,-14,-8,-10,-12,-10,0,0,-4,10,-7,-11,-9,-1,-11,-15,0,-11,-9,-3,-17},\
|
179
|
+
{-13,-12,-12,-15,0,-12,-17,-9,-6,-2,0,-14,-7,10,-11,-5,-10,-5,1,-5,-13,-14,-3,-17},\
|
180
|
+
{-3,-4,-9,-13,-12,-1,-13,-11,-3,-10,-3,-9,-11,-11,8,-1,-3,-13,-11,-12,-10,-3,-5,-17},\
|
181
|
+
{-2,-2,1,-5,-1,-9,-11,-1,-6,-4,-5,-5,-9,-5,-1,8,0,-12,-6,-9,0,-10,-3,-17},\
|
182
|
+
{1,-2,0,-6,-6,-8,-8,-7,-6,0,-8,-1,-1,-10,-3,0,7,-16,-10,-4,-2,-8,-2,-17},\
|
183
|
+
{-16,-5,-17,-16,-2,-14,-15,-5,-11,-14,-6,-12,-11,-5,-13,-12,-16,10,-4,-16,-16,-14,-8,-17},\
|
184
|
+
{-15,-9,-3,-6,0,-9,-12,-14,2,-9,-8,-13,-15,1,-11,-6,-10,-4,10,-12,-4,-10,-4,-17},\
|
185
|
+
{0,-10,-10,-5,-8,-13,-5,-5,-14,2,-4,-9,0,-5,-12,-9,-4,-16,-12,7,-7,-7,-3,-17},\
|
186
|
+
{-5,-7,6,7,-10,-7,0,-4,-1,-7,-13,-1,-11,-13,-10,0,-2,-16,-4,-7,7,-2,-4,-17},\
|
187
|
+
{-5,-3,-6,0,-16,6,6,-3,-2,-12,-6,-1,-9,-14,-3,-10,-8,-14,-10,-7,-2,6,-4,-17},\
|
188
|
+
{-3,-3,-3,-3,-5,-4,-4,-4,-3,-2,-4,-2,-3,-3,-5,-3,-2,-8,-4,-3,-4,-4,-3,-17},\
|
189
|
+
{-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,1}};
|
190
|
+
|
191
|
+
void init_pairscore_hiv25(void) {
|
192
|
+
// ASCII codes for protein alphabet ARNDCQEGHILKMFPSTWYVBZ?*
|
193
|
+
int aa_to_ascii[24] = { 65, 82, 78, 68, 67, 81, 69, 71, 72, 73, 76, 75, 77, 70, 80, 83, 84,
|
194
|
+
87, 89, 86, 66, 90, 63, 42 };
|
195
|
+
int i2, j2;
|
196
|
+
|
197
|
+
// reset score matrix to be safe
|
198
|
+
for (int i=0; i<127; i++) {
|
199
|
+
for (int j=0; j<127; j++) {
|
200
|
+
nucMat[i][j] = 0;
|
201
|
+
}
|
202
|
+
}
|
203
|
+
|
204
|
+
// map HIV 25% empirical matrix to score matrix
|
205
|
+
for (int i=0; i<24; i++) {
|
206
|
+
i2 = aa_to_ascii[i];
|
207
|
+
for (int j=0; j<24; j++) {
|
208
|
+
j2 = aa_to_ascii[j];
|
209
|
+
// also map to lowercase
|
210
|
+
nucMat[i2+32][j2+32] = nucMat[i2+32][j2] = nucMat[i2][j2+32] = nucMat[i2][j2] = empirical_hiv25[i][j];
|
211
|
+
}
|
212
|
+
}
|
213
|
+
}
|
214
|
+
|
215
|
+
|
216
|
+
extern int pairscore(char a, char b)
|
217
|
+
{
|
218
|
+
return nucMat[a][b];
|
219
|
+
}
|
220
|
+
|
221
|
+
// Reverse the characters of *seq in place. An empty string stays empty.
void reverse(string* seq)
{
    // build the mirrored copy from the reverse iterators, then store it back
    string flipped(seq->rbegin(), seq->rend());
    *seq = flipped;
}
|
230
|
+
|
231
|
+
|
232
|
+
//Error must be somewhere in here. Ug...
|
233
|
+
int align(string* seqa, string* seqb, string* newseqa, string* newseqb,
|
234
|
+
int gip, int gep, int use_terminal_gap_penalty)
|
235
|
+
{
|
236
|
+
/*
|
237
|
+
Pairwise alignment with affine gap penalty.
|
238
|
+
see Gotoh, Osamu. "Optimal alignment between groups of sequences and its application
|
239
|
+
to multiple sequence alignment." Computer applications in the biosciences: CABIOS 9.3
|
240
|
+
(1993): 361-370.
|
241
|
+
|
242
|
+
Gap open and extension penalties [gip] and [gep] are assumed to take positive values.
|
243
|
+
*/
|
244
|
+
|
245
|
+
int M = seqa->size(); // first group of pre-aligned sequences
|
246
|
+
int N = seqb->size(); // second group
|
247
|
+
|
248
|
+
// if empty ref, return seqb as-is, and seqa as gaps of size(seqb)
|
249
|
+
// prevents a buffer overflow in the traceback matrices which assume M>0
|
250
|
+
if (M==0)
|
251
|
+
{
|
252
|
+
int j;
|
253
|
+
int alignment_score=0;
|
254
|
+
for (j=0 ; j < N ; j++)
|
255
|
+
{
|
256
|
+
//skip terminal (whole seq) gap penalties if user specifies this option
|
257
|
+
if (use_terminal_gap_penalty==0) alignment_score += (j==0) ? (gip+gep) : gep ;
|
258
|
+
*newseqa += '-';
|
259
|
+
*newseqb += (*seqb)[j];;
|
260
|
+
}
|
261
|
+
|
262
|
+
return alignment_score;
|
263
|
+
}
|
264
|
+
|
265
|
+
int i, j;
|
266
|
+
|
267
|
+
// not all elements of D, P, and Q need to be stored - vectors are adequate
|
268
|
+
int *SS=new int[N+1]; // D(i, .)
|
269
|
+
int *oldSS=new int[N+1]; // D(i-1, .)
|
270
|
+
int *PP = new int[N+1]; // P(i, .)
|
271
|
+
|
272
|
+
// Gotoh traceback matrices
|
273
|
+
int **piSS = new int*[M+1];
|
274
|
+
int **pjSS = new int*[M+1];
|
275
|
+
|
276
|
+
int u = -gip; // affine gap initiation penalty
|
277
|
+
int v = -gep; // affine gap extension penalty
|
278
|
+
|
279
|
+
int w1 = u + v; // gap weight w_k = v * k + u for k = 1
|
280
|
+
int t = u;
|
281
|
+
int s, q;
|
282
|
+
|
283
|
+
// initialize vectors
|
284
|
+
for (j=0; j<N+1; j++)
|
285
|
+
{
|
286
|
+
SS[j]=0;
|
287
|
+
oldSS[j]=0;
|
288
|
+
PP[j]=0;
|
289
|
+
}
|
290
|
+
|
291
|
+
// initialize traceback matrices
|
292
|
+
piSS[0] = new int[N+1];
|
293
|
+
pjSS[0] = new int[N+1];
|
294
|
+
piSS[1] = new int[N+1];
|
295
|
+
pjSS[1] = new int[N+1];
|
296
|
+
piSS[1][0] = 0;
|
297
|
+
pjSS[1][0] = 0;
|
298
|
+
piSS[0][1] = 0;
|
299
|
+
pjSS[0][1] = 0;
|
300
|
+
|
301
|
+
int maxiS = -100000;
|
302
|
+
int maxjS = -100000;
|
303
|
+
int maxij, maxji;
|
304
|
+
|
305
|
+
for (i=1; i < M+1; i++)
|
306
|
+
{
|
307
|
+
t += v; // update gap extension
|
308
|
+
s = t;
|
309
|
+
SS[0]=0;
|
310
|
+
q = t + u;
|
311
|
+
|
312
|
+
// add new rows
|
313
|
+
if (i>1)
|
314
|
+
{
|
315
|
+
piSS[i] = new int[N+1];
|
316
|
+
pjSS[i] = new int[N+1];
|
317
|
+
}
|
318
|
+
|
319
|
+
for (j = 1; j < N + 1; j++)
|
320
|
+
{
|
321
|
+
// recursive calculation of Q
|
322
|
+
if (q >= s + u )
|
323
|
+
q += v; // extension
|
324
|
+
else
|
325
|
+
q = s + u + v; // open
|
326
|
+
|
327
|
+
// recursive calculation of P
|
328
|
+
if ((oldSS[j] + w1) > (PP[j] + v))
|
329
|
+
PP[j] = oldSS[j] + w1;
|
330
|
+
else
|
331
|
+
PP[j] += v;
|
332
|
+
|
333
|
+
int tmp_pp = PP[j];
|
334
|
+
|
335
|
+
// D(i-1, j-1) + d(a_i, b_j)
|
336
|
+
int pscore = oldSS[j - 1] + pairscore((*seqa)[i - 1], (*seqb)[j - 1]);
|
337
|
+
|
338
|
+
//no idea if this will work, but its supposed to be a stop codon aligner
|
339
|
+
//the bonus is assigned on the last codon, if the codons between don't make a big difference it'll be wrong. Hrm.
|
340
|
+
|
341
|
+
if(i >= 3 && j >= 3 && (*seqa)[i-3] == '$' && (*seqa)[i-2] == '$' && (*seqa)[i-1] == '$' &&
|
342
|
+
(((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'A' && (*seqb)[j-1] == 'G') ||
|
343
|
+
((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'A' && (*seqb)[j-1] == 'A') ||
|
344
|
+
((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'G' && (*seqb)[j-1] == 'A') ))
|
345
|
+
{
|
346
|
+
pscore += 6;
|
347
|
+
}
|
348
|
+
if(i >= 3 && j >= 3 && (*seqa)[i-2] == '$' && (*seqa)[i-1] == '$' && (*seqa)[i-0] == '$' &&
|
349
|
+
(((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'A' && (*seqb)[j-0] == 'G') ||
|
350
|
+
((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'A' && (*seqb)[j-0] == 'A') ||
|
351
|
+
((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'G' && (*seqb)[j-0] == 'A') ))
|
352
|
+
{
|
353
|
+
pscore += 6;
|
354
|
+
}
|
355
|
+
if(i >= 3 && j >= 3 && (*seqa)[i-1] == '$' && (*seqa)[i-0] == '$' && (*seqa)[i+1] == '$' &&
|
356
|
+
(((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'A' && (*seqb)[j+1] == 'G') ||
|
357
|
+
((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'A' && (*seqb)[j+1] == 'A') ||
|
358
|
+
((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'G' && (*seqb)[j+1] == 'A') ))
|
359
|
+
{
|
360
|
+
pscore += 6;
|
361
|
+
}
|
362
|
+
|
363
|
+
/*
|
364
|
+
D(i,j) = Min { D(i-1, j-1) + d(a_i, b_j), P(i,j), Q(i,j) }
|
365
|
+
where P(i,j) = Min { D(i-k, j) + w_k } for k = 1, .., i
|
366
|
+
and Q(i,j) = Min { D(i, j-k) + w_k } for k = 1, ..., j
|
367
|
+
|
368
|
+
i.e., three options are:
|
369
|
+
1. match/mismatch,
|
370
|
+
2. gap open/extension in sequence (a),
|
371
|
+
3. gap open/extension in sequence (b)
|
372
|
+
|
373
|
+
pscore = D(i-1, j-1) + d(a_i, b_j)
|
374
|
+
tmp_pp = P(i,j)
|
375
|
+
q = Q(i,j)
|
376
|
+
*/
|
377
|
+
|
378
|
+
//maybe just >?
|
379
|
+
if (tmp_pp >= pscore)
|
380
|
+
{
|
381
|
+
if (tmp_pp > q)
|
382
|
+
{
|
383
|
+
// gap open / extension in (a)
|
384
|
+
s = tmp_pp;
|
385
|
+
piSS[i][j] = i - 1;
|
386
|
+
pjSS[i][j] = j;
|
387
|
+
}
|
388
|
+
else // q > tmp_pp > pscore
|
389
|
+
{
|
390
|
+
// gap open / extension in (b)
|
391
|
+
s = q;
|
392
|
+
piSS[i][j] = i;
|
393
|
+
pjSS[i][j] = j - 1;
|
394
|
+
}
|
395
|
+
}
|
396
|
+
else // pscore > tmp_pp)
|
397
|
+
{
|
398
|
+
if (pscore > q)
|
399
|
+
{
|
400
|
+
// match / mismatch
|
401
|
+
s = pscore;
|
402
|
+
piSS[i][j] = i - 1;
|
403
|
+
pjSS[i][j] = j - 1;
|
404
|
+
}
|
405
|
+
else // q > pscore > tmp_pp
|
406
|
+
{
|
407
|
+
// gap open / extension in (b)
|
408
|
+
s = q;
|
409
|
+
piSS[i][j] = i;
|
410
|
+
pjSS[i][j] = j - 1;
|
411
|
+
}
|
412
|
+
}
|
413
|
+
|
414
|
+
SS[j] = s;
|
415
|
+
|
416
|
+
if (i == M && SS[j] >= maxiS)
|
417
|
+
{
|
418
|
+
maxiS = SS[j];
|
419
|
+
maxij = j;
|
420
|
+
}
|
421
|
+
}
|
422
|
+
|
423
|
+
if (SS[N] >= maxjS)
|
424
|
+
{
|
425
|
+
maxjS = SS[N];
|
426
|
+
maxji = i;
|
427
|
+
}
|
428
|
+
|
429
|
+
for (j = 0; j < N + 1; j++)
|
430
|
+
{
|
431
|
+
oldSS[j] = SS[j];
|
432
|
+
}
|
433
|
+
}
|
434
|
+
|
435
|
+
if (maxij>N)
|
436
|
+
maxij=N;
|
437
|
+
if (maxji>M)
|
438
|
+
maxji=M;
|
439
|
+
if (maxij<0)
|
440
|
+
maxij=0;
|
441
|
+
if (maxji<0)
|
442
|
+
maxji=0;
|
443
|
+
|
444
|
+
//add starting -'s
|
445
|
+
int alignment_score;
|
446
|
+
if (maxiS > maxjS)
|
447
|
+
{
|
448
|
+
alignment_score = maxiS;
|
449
|
+
i = M;
|
450
|
+
j = maxij;
|
451
|
+
for (int kk = N; kk > maxij; kk--)
|
452
|
+
{
|
453
|
+
*newseqb += (*seqb)[kk - 1];
|
454
|
+
*newseqa += '-';
|
455
|
+
}
|
456
|
+
}
|
457
|
+
else
|
458
|
+
{
|
459
|
+
alignment_score = maxjS;
|
460
|
+
i = maxji;
|
461
|
+
j = N;
|
462
|
+
for (int kk = M; kk > maxji; kk--)
|
463
|
+
{
|
464
|
+
*newseqa += (*seqa)[kk - 1];
|
465
|
+
*newseqb += '-';
|
466
|
+
}
|
467
|
+
}
|
468
|
+
|
469
|
+
bool decI = false;
|
470
|
+
bool decJ = false;
|
471
|
+
//inserting -'s in the middle!
|
472
|
+
while(i >= 1 && j >= 1)
|
473
|
+
{
|
474
|
+
decI=false;
|
475
|
+
decJ=false;
|
476
|
+
if (piSS[i][j] < i)
|
477
|
+
{
|
478
|
+
*newseqa += (*seqa)[i - 1];
|
479
|
+
decI = true;
|
480
|
+
}
|
481
|
+
else
|
482
|
+
{
|
483
|
+
*newseqa += '-';
|
484
|
+
}
|
485
|
+
|
486
|
+
if (pjSS[i][j] < j)
|
487
|
+
{
|
488
|
+
*newseqb += (*seqb)[j - 1];
|
489
|
+
decJ=true;
|
490
|
+
}
|
491
|
+
else
|
492
|
+
{
|
493
|
+
*newseqb += '-';
|
494
|
+
}
|
495
|
+
|
496
|
+
if (decI)
|
497
|
+
{
|
498
|
+
i--;
|
499
|
+
}
|
500
|
+
if (decJ)
|
501
|
+
{
|
502
|
+
j--;
|
503
|
+
}
|
504
|
+
}
|
505
|
+
|
506
|
+
//add extra trailing -'s
|
507
|
+
//forgive terminal gap penalties if user specifies this option
|
508
|
+
if (i < j)
|
509
|
+
{
|
510
|
+
for (int jj = j; jj >= 1; jj--)
|
511
|
+
{
|
512
|
+
*newseqb += (*seqb)[jj - 1];
|
513
|
+
*newseqa += '-';
|
514
|
+
if (use_terminal_gap_penalty==0) alignment_score += gep;
|
515
|
+
}
|
516
|
+
if (use_terminal_gap_penalty==0) alignment_score += gip;
|
517
|
+
}
|
518
|
+
else if(i > j)
|
519
|
+
{
|
520
|
+
for (int ii = i; ii >= 1; ii--)
|
521
|
+
{
|
522
|
+
*newseqa += (*seqa)[ii - 1];
|
523
|
+
*newseqb += '-';
|
524
|
+
if (use_terminal_gap_penalty==0) alignment_score += gep;
|
525
|
+
}
|
526
|
+
if (use_terminal_gap_penalty==0) alignment_score += gip;
|
527
|
+
}
|
528
|
+
|
529
|
+
reverse(newseqa);
|
530
|
+
reverse(newseqb);
|
531
|
+
|
532
|
+
for (i = 0; i < M + 1; i++)
|
533
|
+
{
|
534
|
+
delete []piSS[i];
|
535
|
+
delete []pjSS[i];
|
536
|
+
}
|
537
|
+
|
538
|
+
delete []SS;
|
539
|
+
delete []oldSS;
|
540
|
+
delete []piSS;
|
541
|
+
delete []pjSS;
|
542
|
+
delete []PP;
|
543
|
+
return alignment_score;
|
544
|
+
}
|
545
|
+
|
546
|
+
/*
  Remove pre-existing gap characters ('-') from *seq, in place, prior to
  alignment.

  The position returned by find() is kept in string::size_type and compared
  with string::npos (it used to be truncated to unsigned int and compared with
  -1), and each search resumes where the last gap was removed instead of
  rescanning from the start.
*/
void degap(string* seq)
{
    string::size_type pos = seq->find('-');
    while (pos != string::npos)
    {
        seq->erase(pos, 1);
        // the character that followed the gap now sits at pos
        pos = seq->find('-', pos);
    }
}
|
561
|
+
|
562
|
+
/*
  Remove leading and trailing whitespace (space, tab, '\n', '\r') from *seq,
  in place.  Whitespace inside the sequence is kept.

  An empty or all-whitespace string becomes empty; the previous loop indexed
  (*seq)[size() - 1] with size() == 0 in that case.
*/
void trim(string* seq)
{
    static const char* const whitespace = " \t\n\r";

    const string::size_type first = seq->find_first_not_of(whitespace);
    if (first == string::npos)
    {
        // nothing but whitespace (or nothing at all)
        seq->clear();
        return;
    }

    const string::size_type last = seq->find_last_not_of(whitespace);
    seq->erase(last + 1);  // drop the tail first so 'first' stays valid
    seq->erase(0, first);
}
|
577
|
+
|
578
|
+
|
579
|
+
/*
  Cluster gaps: for every '-' at position i, look past a run of identical
  letters on each side; if the run is bounded by another '-', move the letter
  adjacent to i into that far gap so the two gaps become neighbours.

  Backward: letter = seq[i-1]; scanning i-2, i-3, ... a '-' receives the letter
  and seq[i-1] becomes '-'; any different character stops the scan.
  Forward: the mirror image using seq[i+1] and i+2, i+3, ...

  The scan indices are signed now.  The backward index used to be unsigned, so
  "j >= 0" was always true and the scan ran past the front of the string
  whenever the run of identical letters reached position 0 (and i == 0 indexed
  at i-1).
*/
void widen_gaps(string* seq)
{
    const int size = seq->size();
    for (int i = 0; i < size; i++)
    {
        if ((*seq)[i] != '-')
        {
            continue;
        }

        // backwards: needs a character before the gap
        if (i >= 1)
        {
            const char letter = (*seq)[i - 1];
            for (int j = i - 2; j >= 0; j--)
            {
                if ((*seq)[j] == '-')
                {
                    // swap this with i - 1
                    (*seq)[j] = letter;
                    (*seq)[i - 1] = '-';
                    break;
                }
                if ((*seq)[j] != letter)
                {
                    break;
                }
                // same letter: keep walking through the run
            }
        }

        // forwards: needs a character after the gap
        if (i + 1 < size)
        {
            const char letter = (*seq)[i + 1];
            for (int j = i + 2; j < size; j++)
            {
                if ((*seq)[j] == '-')
                {
                    // swap this with i + 1
                    (*seq)[j] = letter;
                    (*seq)[i + 1] = '-';
                    break;
                }
                if ((*seq)[j] != letter)
                {
                    break;
                }
                // same letter: keep walking through the run
            }
        }
    }
}
|
638
|
+
|
639
|
+
#ifdef __PYTHON__
|
640
|
+
/* Python wrapper functions */
|
641
|
+
/*
  Python: align_it(standard, seq, gap_init_penalty, gap_extend_penalty,
                   use_terminal_gap_penalty)

  Aligns two nucleotide strings with match +5 / mismatch -4.  Both inputs are
  trimmed; existing gap characters are kept (degap is not called).
  Returns the tuple (aligned standard, aligned seq, score), or NULL with the
  exception set by PyArg_ParseTuple when the arguments do not parse.
*/
static PyObject * align_it(PyObject * self, PyObject * args)
{
    const char * standard;
    const char * seq;
    int gap_init_penalty;
    int gap_extend_penalty;
    int use_terminal_gap_penalty;

    if (!PyArg_ParseTuple(args, "ssiii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty, &use_terminal_gap_penalty)) {
        return NULL;
    }

    init_pairscore(5, 4); // match, mismatch scores +5, -4 respectively (HyPhy defaults)

    string reference(standard);
    string query(seq);
    trim(&reference);
    trim(&query);

    string aligned_reference;
    string aligned_query;
    const int score = align(&reference, &query, &aligned_reference, &aligned_query,
                            gap_init_penalty, gap_extend_penalty, use_terminal_gap_penalty);

    return Py_BuildValue("ssi", aligned_reference.c_str(), aligned_query.c_str(), score);
}
|
676
|
+
|
677
|
+
/*
  Python: align_it_rb(standard, seq, gap_init_penalty, gap_extend_penalty)

  Emulates the Ruby implementation of align_it: match +1 / mismatch -1, inputs
  trimmed and degapped, use_terminal_gap_penalty fixed at 0.
  Returns the tuple (aligned standard, aligned seq); the score is discarded.
*/
static PyObject * align_it_rb(PyObject * self, PyObject * args)
{
    const char * standard;
    const char * seq;
    int gap_init_penalty;
    int gap_extend_penalty;

    if (!PyArg_ParseTuple(args, "ssii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty)) {
        return NULL;
    }

    init_pairscore(1, 1);

    string reference(standard);
    string query(seq);
    trim(&reference);
    trim(&query);
    degap(&reference);
    degap(&query);

    string aligned_reference;
    string aligned_query;
    align(&reference, &query, &aligned_reference, &aligned_query,
          gap_init_penalty, gap_extend_penalty, 0);

    return Py_BuildValue("ss", aligned_reference.c_str(), aligned_query.c_str());
}
|
710
|
+
|
711
|
+
/*
  Python: align_it_aa(standard, seq, gap_init_penalty, gap_extend_penalty,
                      use_terminal_gap_penalty)

  Aligns two protein strings using the empirical HIV 25% score matrix.  Both
  inputs are trimmed; gaps are not removed (HyPhy behaviour).
  Returns the tuple (aligned standard, aligned seq, score).
*/
static PyObject * align_it_aa(PyObject * self, PyObject * args)
{
    const char * standard;
    const char * seq;
    int gap_init_penalty;
    int gap_extend_penalty;
    int use_terminal_gap_penalty;

    if (!PyArg_ParseTuple(args, "ssiii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty, &use_terminal_gap_penalty)) {
        return NULL;
    }

    init_pairscore_hiv25();

    string reference(standard);
    string query(seq);
    trim(&reference);
    trim(&query);

    string aligned_reference;
    string aligned_query;
    const int score = align(&reference, &query, &aligned_reference, &aligned_query,
                            gap_init_penalty, gap_extend_penalty, use_terminal_gap_penalty);

    return Py_BuildValue("ssi", aligned_reference.c_str(), aligned_query.c_str(), score);
}
|
745
|
+
|
746
|
+
/*
  Python: align_it_aa_rb(standard, seq, gap_init_penalty, gap_extend_penalty)

  Emulates the Ruby implementation of align_it_aa: inputs trimmed and
  degapped, use_terminal_gap_penalty fixed at 0.
  Returns the tuple (aligned standard, aligned seq); the score is discarded.
*/
static PyObject * align_it_aa_rb(PyObject * self, PyObject * args)
{
    const char * standard;
    const char * seq;
    int gap_init_penalty;
    int gap_extend_penalty;

    if (!PyArg_ParseTuple(args, "ssii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty)) {
        return NULL;
    }

    // NOTE(review): init_pairscore_aa stores -mismatchPenalty, so passing -2
    // scores ordinary mismatches as +2 - confirm this is intended.
    init_pairscore_aa(4, -2);

    string reference(standard);
    string query(seq);
    trim(&reference);
    trim(&query);
    degap(&reference);
    degap(&query);

    string aligned_reference;
    string aligned_query;
    align(&reference, &query, &aligned_reference, &aligned_query,
          gap_init_penalty, gap_extend_penalty, 0);

    return Py_BuildValue("ss", aligned_reference.c_str(), aligned_query.c_str());
}
|
779
|
+
|
780
|
+
static PyMethodDef AlignmentMethods [] =
|
781
|
+
{
|
782
|
+
{"align_it", align_it, METH_VARARGS, "Pairwise alignment of nucleotide sequences."},
|
783
|
+
{"align_it_rb", align_it_rb, METH_VARARGS, "Pairwise alignment of nucleotide sequences using ReCall settings."},
|
784
|
+
{"align_it_aa", align_it_aa, METH_VARARGS, "Pairwise alignment of protein sequences using empirical HIV 25% score matrix."},
|
785
|
+
{"align_it_aa_rb", align_it_aa_rb, METH_VARARGS, "Pairwise alignment of protein sequences using ReCall settings."},
|
786
|
+
{NULL, NULL, 0, NULL}
|
787
|
+
};
|
788
|
+
|
789
|
+
static struct PyModuleDef AlignmentModuleDef = {
|
790
|
+
PyModuleDef_HEAD_INIT,
|
791
|
+
"gotoh",
|
792
|
+
NULL,
|
793
|
+
-1,
|
794
|
+
AlignmentMethods,
|
795
|
+
NULL,
|
796
|
+
NULL,
|
797
|
+
NULL,
|
798
|
+
NULL
|
799
|
+
};
|
800
|
+
|
801
|
+
PyMODINIT_FUNC PyInit_gotoh(void) {
|
802
|
+
return PyModule_Create(&AlignmentModuleDef);
|
803
|
+
}
|
804
|
+
|
805
|
+
#else
|
806
|
+
/* Ruby wrapper functions */
|
807
|
+
/*
  Ruby: CfeGotoh.align_it(standard, seq, gap_init_penalty, gap_extend_penalty)

  Aligns two nucleotide strings with match +1 / mismatch -1.  Both inputs are
  trimmed and degapped first; use_terminal_gap_penalty is fixed at 0.
  Returns a two-element Array [aligned standard, aligned seq].

  All Ruby arguments are validated and converted before any C++ object is
  allocated: StringValue raises TypeError for non-string arguments (the raw
  RSTRING_PTR access used to read arbitrary memory instead), and NUM2INT can
  no longer raise while heap strings are outstanding.
*/
extern "C" VALUE align_it(VALUE self, VALUE standard, VALUE seq, VALUE gap_init_penalty, VALUE gap_extend_penalty)
{
    StringValue(standard);
    StringValue(seq);
    const int gip = NUM2INT(gap_init_penalty);
    const int gep = NUM2INT(gap_extend_penalty);

    init_pairscore(1, 1);

    string* seqa = new string(RSTRING_PTR(standard));
    string* seqb = new string(RSTRING_PTR(seq));
    trim(seqa);
    trim(seqb);
    degap(seqa);
    degap(seqb);
    string* newseqa = new string();
    string* newseqb = new string();
    align(seqa, seqb, newseqa, newseqb, gip, gep, 0);

    VALUE ret = rb_ary_new3(2, rb_str_new2(newseqa->c_str()),rb_str_new2(newseqb->c_str()));

    delete seqa;
    delete seqb;
    delete newseqa;
    delete newseqb;

    return ret;
}
|
830
|
+
|
831
|
+
/*
  Ruby: CfeGotoh.align_it_aa(standard, seq, gap_init_penalty, gap_extend_penalty)

  Aligns two protein strings using the init_pairscore_aa scores.  Both inputs
  are trimmed and degapped first; use_terminal_gap_penalty is fixed at 0.
  Returns a two-element Array [aligned standard, aligned seq].

  All Ruby arguments are validated and converted before any C++ object is
  allocated: StringValue raises TypeError for non-string arguments (the raw
  RSTRING_PTR access used to read arbitrary memory instead), and NUM2INT can
  no longer raise while heap strings are outstanding.
*/
extern "C" VALUE align_it_aa(VALUE self, VALUE standard, VALUE seq, VALUE gap_init_penalty, VALUE gap_extend_penalty)
{
    StringValue(standard);
    StringValue(seq);
    const int gip = NUM2INT(gap_init_penalty);
    const int gep = NUM2INT(gap_extend_penalty);

    // NOTE(review): init_pairscore_aa stores -mismatchPenalty, so passing -2
    // scores ordinary mismatches as +2 - confirm this is intended.
    init_pairscore_aa(4, -2);

    string* seqa = new string(RSTRING_PTR(standard));
    string* seqb = new string(RSTRING_PTR(seq));
    trim(seqa);
    trim(seqb);
    degap(seqa);
    degap(seqb);
    string* newseqa = new string();
    string* newseqb = new string();
    align(seqa, seqb, newseqa, newseqb, gip, gep, 0);

    VALUE ret = rb_ary_new3(2, rb_str_new2(newseqa->c_str()),rb_str_new2(newseqb->c_str()));

    delete seqa;
    delete seqb;
    delete newseqa;
    delete newseqb;

    return ret;
}
|
854
|
+
|
855
|
+
extern "C" void Init_cfe_gotoh()
|
856
|
+
{
|
857
|
+
VALUE gotoh = rb_define_module("CfeGotoh");
|
858
|
+
rb_define_module_function(gotoh, "align_it", (VALUE(*)(...))align_it, 4);
|
859
|
+
rb_define_module_function(gotoh, "align_it_aa", (VALUE(*)(...))align_it_aa, 4);
|
860
|
+
}
|
861
|
+
|
862
|
+
#endif
|
data/lib/cfe_gotoh.rb
ADDED
@@ -0,0 +1,413 @@
|
|
1
|
+
#TODO: Scoring algorithm to improve frame_align?
|
2
|
+
|
3
|
+
require_relative 'cfe_gotoh/cfe_gotoh'
|
4
|
+
|
5
|
+
|
6
|
+
module CfeGotoh
|
7
|
+
class Error < RuntimeError
|
8
|
+
end
|
9
|
+
|
10
|
+
class GapMergeError < Error
|
11
|
+
end
|
12
|
+
|
13
|
+
# Substitution scores for nucleotide symbols, indexed by character code:
#   NUCLEOTIDE_MATRIX[a.ord][b.ord]
# Every pair that is not set explicitly below scores -1.0.
scores = Array.new(127) { Array.new(127, -1.0) }

# Assigns the same score to (a, b) and (b, a).
set_pair = lambda do |a, b, value|
  scores[a.ord][b.ord] = value
  scores[b.ord][a.ord] = value
end

bases = %w[A T G C]

# Mixture codes and the bases each of them is scored as matching.
mixtures = {
  # bi-mixtures
  'R' => 'AG', 'Y' => 'CT', 'K' => 'GT', 'M' => 'AC', 'S' => 'CG', 'W' => 'AT',
  # tri-mixtures
  'B' => 'CGT', 'D' => 'AGT', 'H' => 'ACT', 'V' => 'ACG'
}

# Each code matches itself; 'X' is heavily penalised against every code
# except 'N'.
(bases + mixtures.keys + ['N']).each do |code|
  set_pair.call(code, code, 1.0)
  set_pair.call(code, 'X', -6.0) unless code == 'N'
end

mixtures.each do |code, members|
  members.each_char { |base| set_pair.call(base, code, 1.0) }
end

# other
set_pair.call('$', '$', 50.0)
set_pair.call('T', 'U', 1.0)
set_pair.call('N', 'N', 0.0) # replaces the self-match score assigned above
set_pair.call('X', '-', 3.0)

# Scores of the four plain bases against the remaining special symbols.
special_scores = { '*' => 1.0, '&' => 0.7, '$' => 0.0, '.' => -20.0, 'N' => -3.0 }
bases.each do |base|
  special_scores.each { |symbol, value| set_pair.call(base, symbol, value) }
end

# Rows and the outer array are frozen so the table cannot be modified.
NUCLEOTIDE_MATRIX = scores.each(&:freeze).freeze
|
60
|
+
|
61
|
+
# Scores an alignment by adding up the NUCLEOTIDE_MATRIX entry of every
# column. Characters are upcased before lookup; the number of columns
# visited is the length of +standard+. Returns a Float (0.0 for an empty
# standard).
def self.score_alignment(standard, query)
  standard.each_char.with_index.inject(0.0) do |total, (ref_base, column)|
    total + NUCLEOTIDE_MATRIX[ref_base.upcase.ord][query[column, 1].upcase.ord]
  end
end
|
68
|
+
|
69
|
+
# Lists the gaps of an aligned sequence. Each gap is an array holding the
# consecutive indexes of one run of '-' characters, e.g.
#   "A--C-"  =>  [[1, 2], [4]]
# Returns an empty array when the sequence contains no '-'.
def self.make_gap_list(seq)
  runs = []
  seq.each_char.with_index do |symbol, pos|
    next unless symbol == '-'
    if !runs.empty? && runs.last.last + 1 == pos
      runs.last << pos # extends the run that ended on the previous column
    else
      runs << [pos]
    end
  end
  runs
end
|
88
|
+
|
89
|
+
# Removes the run of '-' at the very start of +standard+, and the same
# number of leading characters from +query+. Both strings are modified in
# place. Nothing happens when +standard+ does not begin with dashes followed
# by at least one non-dash character.
def self.trim_leading_dashes(standard, query)
  # \A anchors at the start of the string. The previous ^ anchor also
  # matched after an embedded newline, which trimmed the front of the
  # strings by the length of a dash run found further in.
  leading_dashes_match = /\A(-+)[^-]/.match(standard)
  return if leading_dashes_match.nil?

  width = leading_dashes_match[1].size
  standard[0, width] = ''
  query[0, width] = ''
end
|
98
|
+
|
99
|
+
# Removes the run of '-' at the very end of +standard+, and the characters
# of +query+ in the same columns. Both strings are modified in place.
# Nothing happens when +standard+ does not end with dashes preceded by at
# least one non-dash character.
def self.trim_trailing_dashes(standard, query)
  # \z anchors at the end of the string. The previous $ anchor also matched
  # before an embedded newline, so the columns removed were computed from
  # the wrong dash run.
  trailing_dashes_match = /[^-](-+)\z/.match(standard)
  return if trailing_dashes_match.nil?

  width = trailing_dashes_match[1].size
  cut_from = standard.size - width
  standard[cut_from, width] = ''
  query[cut_from, width] = ''
end
|
109
|
+
|
110
|
+
# Pads the run of dashes at one edge of +query+ to a multiple of three by
# overwriting the bases next to it with '-'. +query+ is modified in place
# and never changes length.
#
# side - :leading works on the start of the string; any other value works
#        on the end.
#
# Nothing happens when the edge character is not '-', when the whole string
# is dashes, or when the run is already a multiple of three long.
def self.fix_incomplete_edge_codon(query, side=:leading)
  leading = (side == :leading)
  return unless query[leading ? 0 : -1] == '-'

  # \A and \z so that only the true ends of the string are considered.
  dash_regex = leading ? /\A(-+)[^-]/ : /[^-](-+)\z/
  match = dash_regex.match(query)
  # No match means the query holds nothing but dashes; previously this
  # raised NoMethodError on nil.
  return if match.nil?

  dash_count = match[1].size
  remainder = dash_count % 3
  return if remainder == 0

  # One dash over a codon boundary needs two more, two over needs one more.
  bases_to_blank = (remainder == 1) ? 2 : 1
  step = leading ? 1 : -1
  first_non_dash_idx = leading ? dash_count : query.size - dash_count - 1

  bases_to_blank.times do |k|
    idx = first_non_dash_idx + k * step
    # Stay inside the string: an index equal to the length would append a
    # character, and a negative one would wrap round to the other end.
    query[idx] = '-' if idx >= 0 && idx < query.size
  end
  nil
end
|
138
|
+
|
139
|
+
# Merge deletions and insertions until the sequences have a cogent length
# (i.e. have length divisible by 3). This helps fix poor insertions near
# the start of the sequence.
#
# A dash in +standard+ is merged with a dash in +query+ lying one or two
# columns away: both dashes are deleted, shortening each string by one.
# Both strings are modified in place. Raises RuntimeError when they differ
# in length on entry.
def self.merge_insertions_and_deletions_to_fix_oof_sequences(
  standard,
  query
)
  raise 'Standard and query should be the same length' if standard.size() != query.size()
  return if standard.size % 3 == 0

  dex = 0
  while (dex = standard.index('-', dex))
    # look one base away, then two bases away
    partner = [-1, 1, -2, 2].map { |offset| dex + offset }.find do |pos|
      pos >= 0 && query[pos] == '-'
    end

    if partner.nil?
      dex += 1
      next
    end

    standard[dex] = ''
    query[partner] = ''

    # Stop if the sequences are now a cogent length.
    break if standard.size % 3 == 0

    # Rescan from the first column. The scan used to resume at index 1
    # after a merge, so a dash at index 0 of the standard was never
    # reconsidered.
    dex = 0
  end
  nil
end
|
167
|
+
|
168
|
+
# Merge adjacent gaps if they are not a codon-sized gap.
#
# gaps         - gaps in ascending order, each an array of consecutive
#                positions (the format produced by make_gap_list).
# raise_errors - when true, a gap that is not a multiple of three long and
#                cannot be merged raises GapMergeError; otherwise it is
#                kept as it is.
#
# Returns a new list of gaps. +gaps+ itself is left unmodified (consumed
# gaps used to be overwritten with [] in the caller's array); gaps that
# need no merging are the same array objects in both lists.
def self.cluster_gaps(gaps, raise_errors=false)
  new_gap_list = []
  eaten = {} # indexes of gaps already absorbed into an earlier gap

  gaps.each_with_index do |gap, i|
    next if eaten[i] || gap.size == 0
    if gap.size % 3 == 0 # this gap is fine!
      new_gap_list << gap
      next
    end

    gap2 = gaps[i + 1] # note: these could be nil, which we test for below
    gap3 = gaps[i + 2]

    if gap2 && (gap.size + gap2.size) % 3 == 0 && (gap2.first - gap.last) < 9
      # Merge with the next gap, growing the larger one towards the smaller.
      merged =
        if gap2.size > gap.size
          ((gap2.first - gap.size)..(gap2.first - 1)).to_a + gap2
        else
          gap + ((gap.last + 1)..(gap.last + gap2.size)).to_a
        end
      new_gap_list << merged
      eaten[i + 1] = true
    elsif gap2 && gap3 &&
          (gap.size + gap2.size + gap3.size) % 3 == 0 &&
          (gap3.first - gap.last) < 12
      # Merge with the next two gaps.
      # Place the gap around the middle of the three merging gaps.
      new_gap_list << (
        ((gap2.first - gap.size)..(gap2.first - 1)).to_a +
        gap2 +
        ((gap2.last + 1)..(gap2.last + gap3.size)).to_a
      )
      eaten[i + 1] = true
      eaten[i + 2] = true
    elsif raise_errors
      # We can't merge the gaps; either raise an error or meekly proceed.
      raise GapMergeError
    else
      new_gap_list << gap # FIXME this behaviour differs between insertions and deletions
    end
  end

  new_gap_list
end
|
215
|
+
|
216
|
+
# Moves gaps onto codon boundaries, preferring the common gap locations
# when they are supplied.
#
# gaps                 - gaps in ascending (left-to-right) order; every gap
#                        array is rewritten in place.
# common_gap_locations - optional list of positions that are multiplied by
#                        three before being compared with a gap's start.
#
# Returns +gaps+.
def self.align_gaps_to_frame(gaps, common_gap_locations=nil)
  shift = 0 # total length of the gaps handled so far
  gaps.each do |gap|
    unless common_gap_locations.nil?
      # Pick the common location nearest to where this gap starts once the
      # earlier gaps are discounted.
      nearest = common_gap_locations.min do |a, b|
        (3 * a - (gap[0] - shift)).abs <=> (3 * b - (gap[0] - shift)).abs
      end
      if !nearest.nil? && (3 * nearest - (gap[0] - shift)).abs <= 9
        # Close enough: restart the gap at that location.
        anchor = 3 * nearest + shift
        gap.replace(Array.new(gap.size) { |k| anchor + k })
      end
    end

    # Nudge the gap onto the nearest frame boundary.
    # Original comment from Conan: scoring would be good here
    case gap[0] % 3
    when 1 then gap.map! { |pos| pos - 1 } # set back one base
    when 2 then gap.map! { |pos| pos + 1 } # set forward one base
    end

    shift += gap.size
  end
  gaps
end
|
258
|
+
|
259
|
+
# Returns a copy of +seq+ with its existing dashes removed and a '-'
# inserted at every position listed in +gaps+. Positions refer to the
# *aligned* sequence, so they already include the shift caused by gaps
# placed earlier; the gaps must be in left-to-right order. A position past
# the end of the sequence appends the dash instead.
def self.splice_gaps_into_sequence(seq, gaps)
  spliced = seq.delete('-')
  gaps.each do |gap|
    gap.each do |pos|
      spliced.insert(pos > spliced.size ? -1 : pos, '-')
    end
  end
  spliced
end
|
276
|
+
|
277
|
+
#common_insert_locations is based on amino acid locations starting at base 0.
|
278
|
+
#Assumes standard in the first base.
|
279
|
+
#Prealign lets you run a lot of the corrections and qc on a already aligned sequence.
|
280
|
+
# Aligns +query+ to +standard+ and rearranges the gaps so that they sit on
# codon boundaries.
#
# standard, query         - the sequences; with +prealigned+ they must
#                           already be aligned and of equal length.
# gap_init, gap_penalty   - passed through to align_it.
# common_insert_locations - positions preferred when placing insertions
#                           (nil is treated as an empty list).
# trim                    - trim the dashes at both ends of the standard
#                           and pad partial edge codons in the query.
# raise_errors            - raise instead of returning an alignment whose
#                           gaps cannot be brought into frame.
# prealigned              - skip align_it and only run the corrections.
#
# Returns [standard, query]. The arguments are never modified: prealigned
# input is copied first (it used to be edited in place, which also failed
# on frozen strings).
def self.frame_align(
  standard,
  query,
  gap_init=3,
  gap_penalty=1,
  common_insert_locations=nil,
  trim=false,
  raise_errors=false,
  prealigned=false
)
  common_insert_locations = [] if common_insert_locations.nil?

  if prealigned
    # The steps below edit the strings in place, so work on copies.
    standard = standard.dup
    query = query.dup
  else
    elem = align_it(standard, query, gap_init, gap_penalty)
    standard = elem[0]
    query = elem[1]
  end
  raise "Standard and query should be the same length" if standard.size() != query.size()

  # Trim leading and trailing dashes if desired.
  if trim
    trim_leading_dashes(standard, query)
    trim_trailing_dashes(standard, query)
    fix_incomplete_edge_codon(query, :leading)
    fix_incomplete_edge_codon(query, :trailing)
  end

  merge_insertions_and_deletions_to_fix_oof_sequences(standard, query)

  if raise_errors
    inserted = standard.count('-')
    if inserted % 3 != 0
      raise "Cannot frame align, #{inserted} inserted bases not divisible by 3"
    end
    deleted = query.count('-')
    if deleted % 3 != 0
      raise "Cannot frame align, #{deleted} deleted bases not divisible by 3"
    end
  end

  # Build the insert/delete lists. These lists look like
  # [[3,4,5], [9], [11,12]]
  # Both are built before either sequence is rebuilt below.
  insert_list = make_gap_list(standard)
  delete_list = make_gap_list(query)

  # Process the insertions.
  if insert_list.size > 0
    new_ins_list = []

    # Step 1: cluster the insertions.
    begin
      new_ins_list = cluster_gaps(insert_list, raise_errors)
    rescue GapMergeError
      raise "Cannot frame align insert" if raise_errors
    end

    # Step 2: frame-align the insertions, shifting things to common insertion
    # positions where appropriate.
    align_gaps_to_frame(new_ins_list, common_insert_locations)

    # Put the insertions back into the standard.
    standard = splice_gaps_into_sequence(standard, new_ins_list)
  end

  # Process the deletions.
  if delete_list.size > 0
    new_del_list = []

    # As above, step 1 is to cluster the deletions.
    # FIXME note that the original code behaved differently between
    # insertions and deletions; confirm that this is the right
    # way forward.
    begin
      new_del_list = cluster_gaps(delete_list, raise_errors)
    rescue GapMergeError
      raise "Cannot frame align deletion" if raise_errors
    end

    # Again as above, frame-align the deletions; this time
    # we don't worry about any common deletion positions.
    align_gaps_to_frame(new_del_list)

    # Put the deletions back into the query.
    query = splice_gaps_into_sequence(query, new_del_list)
  end

  [standard, query]
end
|
365
|
+
|
366
|
+
#Returns a [seq_sans_inserts, [list of inserts]]
|
367
|
+
# Convenience wrapper around remove_insertions_from_query for an alignment
# given as a pair: elem[0] is the aligned standard, elem[1] the aligned
# query. Returns whatever remove_insertions_from_query returns.
def self.remove_inserts(elem)
  aligned_standard = elem[0]
  aligned_query = elem[1]
  remove_insertions_from_query(aligned_standard, aligned_query)
end
|
370
|
+
|
371
|
+
# Removes from +query+ every column in which +standard+ has a '-', i.e. the
# bases the query has inserted relative to the standard.
#
# Returns [seq, inserts]:
#   seq     - a new string holding the query without the inserted columns.
#   inserts - one [position, bases] pair per run of inserted columns, in
#             left-to-right order. position is the run's first column,
#             less the inserted columns before it, integer-divided by 3;
#             bases are the query characters of the run.
#
# Neither argument is modified. Characters of the query outside the
# inserted columns are kept verbatim; '.' used to be stripped from the
# result because it served as the deletion marker.
def self.remove_insertions_from_query(standard, query)
  # Group the dash columns of the standard into runs of consecutive indexes.
  runs = []
  0.upto(standard.size - 1) do |i|
    next unless standard[i, 1] == '-'
    if !runs.empty? && runs.last.last + 1 == i
      runs.last << i
    else
      runs << [i]
    end
  end

  seq = ''
  inserts = []
  offset = 0    # inserted columns seen before the current run
  copied_to = 0 # first query column not yet copied into seq
  runs.each do |run|
    inserts << [(run.first - offset) / 3, query[run.first, run.size].to_s]
    offset += run.size
    seq += query[copied_to...run.first].to_s
    copied_to = run.last + 1
  end
  seq += query[copied_to..-1].to_s

  [seq, inserts]
end
|
413
|
+
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cfe_gotoh
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.0.pre
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Conan Woods
|
8
|
+
- Jamie Kai
|
9
|
+
- David Rickett
|
10
|
+
- Richard Liang
|
11
|
+
autorequire:
|
12
|
+
bindir: bin
|
13
|
+
cert_chain: []
|
14
|
+
date: 2024-11-22 00:00:00.000000000 Z
|
15
|
+
dependencies: []
|
16
|
+
description:
|
17
|
+
email:
|
18
|
+
executables: []
|
19
|
+
extensions:
|
20
|
+
- ext/cfe_gotoh/extconf.rb
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- ext/cfe_gotoh/cfe_gotoh.cpp
|
24
|
+
- ext/cfe_gotoh/extconf.rb
|
25
|
+
- lib/cfe_gotoh.rb
|
26
|
+
homepage:
|
27
|
+
licenses: []
|
28
|
+
metadata:
|
29
|
+
github_repo: ssh://github.com/cfe-lab/gotoh
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">"
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.3.1
|
44
|
+
requirements: []
|
45
|
+
rubygems_version: 3.0.9
|
46
|
+
signing_key:
|
47
|
+
specification_version: 4
|
48
|
+
summary: CfE implementation of the Gotoh sequence alignment algorithm
|
49
|
+
test_files: []
|