cfe_gotoh 0.4.0.pre

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 69fd99e925d82fb14d1035e8461fd32d6d83895ae9bf012efea277596772e7ce
4
+ data.tar.gz: 92b0780c678fa34b4be4d2148df83ed1f60c9d63c15a5742f3c17fa1cb4beb24
5
+ SHA512:
6
+ metadata.gz: ee778affdf1b42aca92b9acbedf6368df7b906a0b5ab6793132b126f04b781baa2a88a04254b0aa9c51a2f2dd3b09b9b0713edbba9ba85db86e26b6eaa0a5745
7
+ data.tar.gz: 01f949ecf25c278706fe4718b59e9ac3aa6e643d8ab3063a5fa1aba390d7f2fd3b28e7dbe1703fce94a84affd7b6573e1bbac29f117bb3442d7a6ef199fd8cbe
@@ -0,0 +1,862 @@
1
+ #include <string>
2
+
3
+ #ifdef __PYTHON__
4
+ #include <Python.h>
5
+ #else
6
+ #include "ruby.h"
7
+
8
+ #ifndef RSTRING_PTR
9
+ // Ruby 1.8.5 doesn't include this definition
10
+ #define RSTRING_PTR(s) (RSTRING(s)->ptr)
11
+ #endif
12
+ #endif
13
+
14
+ using namespace std;
15
+
16
+ /*
17
+ I think this application should do a complete alignment. Unfortunately aligning is way too slow in perl,
18
+ so I suspect the alignment, merging and possibly the gap widening should be done in c. Another possibility
19
+ is to call the C functions from perl, which would simplify things quite a bit! Unfortunately, I'm not entirely
20
+ confident that perl can do this seamlessly (unlike nicer languages like ruby and python).
21
+ */
22
+
23
+ void trim(string* seq);
24
+
25
/*
  Pairwise substitution scores, indexed as nucMat[a][b] by the character
  codes of the two residues being compared.  Every init_pairscore* routine
  in this file overwrites this one shared table.
*/
static int nucMat[127][127]; // rows/columns exist for character codes 0..126

/*
  Fill nucMat with the nucleotide scoring scheme.

  matchscore       score given to identical characters and to the
                   "compatible" pairs listed below
  mismatchPenalty  positive penalty; every other pair scores -mismatchPenalty
                   unless overridden below

  After the generic fill, fixed overrides are applied in this order (later
  lines win where they overlap):
    - upper/lower case A, C, G, T/U are treated as matches of each other
    - N against N scores 0
    - upper-case IUPAC two- and three-base mixtures match the bases they contain
    - '*' matches any of A, C, G, T
    - '$' against '$' scores 50
    - '.' against a base scores -20, 'N' against a base scores -3
    - 'X' against a base or mixture code scores -6, 'X' against '-' scores 3
*/
void init_pairscore(int matchscore, int mismatchPenalty)
{
    for (int i=0; i<127; i++)
    {
        for (int j=0; j<127; j++)
        {
            nucMat[i][j] = (i==j) ? matchscore : -mismatchPenalty;
        }
    }

    // adjust naive assignments for case-insensitivity
    nucMat['a']['A']=nucMat['A']['a']=matchscore;
    nucMat['c']['C']=nucMat['C']['c']=matchscore;
    nucMat['g']['G']=nucMat['G']['g']=matchscore;
    nucMat['t']['T']=nucMat['T']['t']=nucMat['u']['U']=nucMat['U']['u']=matchscore;
    nucMat['t']['u']=nucMat['t']['U']=nucMat['T']['u']=nucMat['T']['U']=matchscore;
    // mirror of the line above; the ['u']['T'] entry was previously missing
    nucMat['u']['t']=nucMat['u']['T']=nucMat['U']['t']=nucMat['U']['T']=matchscore;
    nucMat['N']['N']=nucMat['n']['N']=nucMat['N']['n']=0;

    //bi-mixtures
    nucMat['A']['R']=nucMat['R']['A']=matchscore;
    nucMat['G']['R']=nucMat['R']['G']=matchscore;

    nucMat['C']['Y']=nucMat['Y']['C']=matchscore;
    nucMat['T']['Y']=nucMat['Y']['T']=matchscore;

    nucMat['G']['K']=nucMat['K']['G']=matchscore;
    nucMat['T']['K']=nucMat['K']['T']=matchscore;

    nucMat['C']['M']=nucMat['M']['C']=matchscore;
    nucMat['A']['M']=nucMat['M']['A']=matchscore;

    nucMat['C']['S']=nucMat['S']['C']=matchscore;
    nucMat['G']['S']=nucMat['S']['G']=matchscore;

    nucMat['T']['W']=nucMat['W']['T']=matchscore;
    nucMat['A']['W']=nucMat['W']['A']=matchscore;

    //tri-mixtures
    nucMat['C']['B']=nucMat['B']['C']=matchscore;
    nucMat['G']['B']=nucMat['B']['G']=matchscore;
    nucMat['T']['B']=nucMat['B']['T']=matchscore;

    nucMat['A']['D']=nucMat['D']['A']=matchscore;
    nucMat['G']['D']=nucMat['D']['G']=matchscore;
    nucMat['T']['D']=nucMat['D']['T']=matchscore;

    nucMat['A']['H']=nucMat['H']['A']=matchscore;
    nucMat['C']['H']=nucMat['H']['C']=matchscore;
    nucMat['T']['H']=nucMat['H']['T']=matchscore;

    nucMat['A']['V']=nucMat['V']['A']=matchscore;
    nucMat['C']['V']=nucMat['V']['C']=matchscore;
    nucMat['G']['V']=nucMat['V']['G']=matchscore;

    //Wild cards
    nucMat['*']['A']=nucMat['*']['a']=nucMat['A']['*']=nucMat['a']['*']=matchscore;
    nucMat['*']['C']=nucMat['*']['c']=nucMat['C']['*']=nucMat['c']['*']=matchscore;
    nucMat['*']['T']=nucMat['*']['t']=nucMat['T']['*']=nucMat['t']['*']=matchscore;
    nucMat['*']['G']=nucMat['*']['g']=nucMat['G']['*']=nucMat['g']['*']=matchscore;

    // '$' is the stop-codon marker that align() looks for in the reference
    nucMat['$']['$']=50;

    //For those annoying duplicate phred values.
    nucMat['.']['A']=nucMat['.']['a']=nucMat['A']['.']=nucMat['a']['.']=-20;
    nucMat['.']['C']=nucMat['.']['c']=nucMat['C']['.']=nucMat['c']['.']=-20;
    nucMat['.']['T']=nucMat['.']['t']=nucMat['T']['.']=nucMat['t']['.']=-20;
    nucMat['.']['G']=nucMat['.']['g']=nucMat['G']['.']=nucMat['g']['.']=-20;

    nucMat['N']['A']=nucMat['N']['a']=nucMat['A']['N']=nucMat['a']['N']=-3;
    nucMat['N']['C']=nucMat['N']['c']=nucMat['C']['N']=nucMat['c']['N']=-3;
    nucMat['N']['T']=nucMat['N']['t']=nucMat['T']['N']=nucMat['t']['N']=-3;
    nucMat['N']['G']=nucMat['N']['g']=nucMat['G']['N']=nucMat['g']['N']=-3;

    //for easy alignment to a standard with gaps
    nucMat['X']['A']=nucMat['X']['a']=nucMat['A']['X']=nucMat['a']['X']=-6;
    nucMat['X']['C']=nucMat['X']['c']=nucMat['C']['X']=nucMat['c']['X']=-6;
    nucMat['X']['T']=nucMat['X']['t']=nucMat['T']['X']=nucMat['t']['X']=-6;
    nucMat['X']['G']=nucMat['X']['g']=nucMat['G']['X']=nucMat['g']['X']=-6;
    nucMat['X']['R']=nucMat['X']['r']=nucMat['R']['X']=nucMat['r']['X']=-6;
    nucMat['X']['Y']=nucMat['X']['y']=nucMat['Y']['X']=nucMat['y']['X']=-6;
    nucMat['X']['K']=nucMat['X']['k']=nucMat['K']['X']=nucMat['k']['X']=-6;
    nucMat['X']['M']=nucMat['X']['m']=nucMat['M']['X']=nucMat['m']['X']=-6;
    nucMat['X']['S']=nucMat['X']['s']=nucMat['S']['X']=nucMat['s']['X']=-6;
    nucMat['X']['W']=nucMat['X']['w']=nucMat['W']['X']=nucMat['w']['X']=-6;
    nucMat['X']['B']=nucMat['X']['b']=nucMat['B']['X']=nucMat['b']['X']=-6;
    nucMat['X']['D']=nucMat['X']['d']=nucMat['D']['X']=nucMat['d']['X']=-6;
    nucMat['X']['H']=nucMat['X']['h']=nucMat['H']['X']=nucMat['h']['X']=-6;
    nucMat['X']['V']=nucMat['X']['v']=nucMat['V']['X']=nucMat['v']['X']=-6;
    // symmetric like every other pair; the ['-']['X'] half was previously missing
    nucMat['X']['-']=nucMat['-']['X']=3;
}
132
+
133
+
134
+ void init_pairscore_aa(int matchscore, int mismatchPenalty)
135
+ {
136
+ for (int i=0; i<127; i++)
137
+ {
138
+ for (int j=0; j<127; j++)
139
+ {
140
+ if(i==j)
141
+ {
142
+ nucMat[i][j]=matchscore;
143
+ }
144
+ else
145
+ {
146
+ nucMat[i][j]=-mismatchPenalty;
147
+ if((char)i=='X' || (char)j=='X')
148
+ {
149
+ nucMat[i][j]=-4;
150
+ }
151
+ }
152
+ }
153
+ }
154
+
155
+ nucMat['Z']['Z']=nucMat['z']['Z']=nucMat['Z']['z']=0;
156
+ nucMat['X']['-']=nucMat['-']['X']=matchscore;
157
+ }
158
+
159
+
160
+ /*
161
+ Empirical score matrix based on 25% divergent HIV sequences
162
+ See Nickle, David C., et al. "HIV-specific probabilistic models of protein evolution."
163
+ PLoS One 2.6 (2007): e503.
164
+ */
165
+ static int empirical_hiv25[24][24] = {\
166
+ {7,-7,-7,-4,-10,-11,-4,-3,-10,-6,-9,-9,-7,-13,-3,-2,1,-16,-15,0,-5,-5,-3,-17},\
167
+ {-7,7,-5,-11,-8,-2,-7,-2,0,-6,-6,2,-3,-12,-4,-2,-2,-5,-9,-10,-7,-3,-3,-17},\
168
+ {-7,-5,8,2,-9,-6,-6,-7,0,-6,-12,0,-10,-12,-9,1,0,-17,-3,-10,6,-6,-3,-17},\
169
+ {-4,-11,2,8,-14,-10,0,-2,-3,-11,-15,-7,-13,-15,-13,-5,-6,-16,-6,-5,7,0,-3,-17},\
170
+ {-10,-8,-9,-14,11,-16,-15,-5,-7,-11,-9,-13,-14,0,-12,-1,-6,-2,0,-8,-10,-16,-5,-17},\
171
+ {-11,-2,-6,-10,-16,8,-2,-10,0,-12,-4,0,-8,-12,-1,-9,-8,-14,-9,-13,-7,6,-4,-17},\
172
+ {-4,-7,-6,0,-15,-2,7,-1,-9,-12,-15,-1,-10,-17,-13,-11,-8,-15,-12,-5,0,6,-4,-17},\
173
+ {-3,-2,-7,-2,-5,-10,-1,7,-10,-11,-14,-6,-12,-9,-11,-1,-7,-5,-14,-5,-4,-3,-4,-17},\
174
+ {-10,0,0,-3,-7,0,-9,-10,10,-10,-4,-5,-10,-6,-3,-6,-6,-11,2,-14,-1,-2,-3,-17},\
175
+ {-6,-6,-6,-11,-11,-12,-12,-11,-10,7,0,-7,0,-2,-10,-4,0,-14,-9,2,-7,-12,-2,-17},\
176
+ {-9,-6,-12,-15,-9,-4,-15,-14,-4,0,6,-10,0,0,-3,-5,-8,-6,-8,-4,-13,-6,-4,-17},\
177
+ {-9,2,0,-7,-13,0,-1,-6,-5,-7,-10,7,-4,-14,-9,-5,-1,-12,-13,-9,-1,-1,-2,-17},\
178
+ {-7,-3,-10,-13,-14,-8,-10,-12,-10,0,0,-4,10,-7,-11,-9,-1,-11,-15,0,-11,-9,-3,-17},\
179
+ {-13,-12,-12,-15,0,-12,-17,-9,-6,-2,0,-14,-7,10,-11,-5,-10,-5,1,-5,-13,-14,-3,-17},\
180
+ {-3,-4,-9,-13,-12,-1,-13,-11,-3,-10,-3,-9,-11,-11,8,-1,-3,-13,-11,-12,-10,-3,-5,-17},\
181
+ {-2,-2,1,-5,-1,-9,-11,-1,-6,-4,-5,-5,-9,-5,-1,8,0,-12,-6,-9,0,-10,-3,-17},\
182
+ {1,-2,0,-6,-6,-8,-8,-7,-6,0,-8,-1,-1,-10,-3,0,7,-16,-10,-4,-2,-8,-2,-17},\
183
+ {-16,-5,-17,-16,-2,-14,-15,-5,-11,-14,-6,-12,-11,-5,-13,-12,-16,10,-4,-16,-16,-14,-8,-17},\
184
+ {-15,-9,-3,-6,0,-9,-12,-14,2,-9,-8,-13,-15,1,-11,-6,-10,-4,10,-12,-4,-10,-4,-17},\
185
+ {0,-10,-10,-5,-8,-13,-5,-5,-14,2,-4,-9,0,-5,-12,-9,-4,-16,-12,7,-7,-7,-3,-17},\
186
+ {-5,-7,6,7,-10,-7,0,-4,-1,-7,-13,-1,-11,-13,-10,0,-2,-16,-4,-7,7,-2,-4,-17},\
187
+ {-5,-3,-6,0,-16,6,6,-3,-2,-12,-6,-1,-9,-14,-3,-10,-8,-14,-10,-7,-2,6,-4,-17},\
188
+ {-3,-3,-3,-3,-5,-4,-4,-4,-3,-2,-4,-2,-3,-3,-5,-3,-2,-8,-4,-3,-4,-4,-3,-17},\
189
+ {-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,1}};
190
+
191
+ void init_pairscore_hiv25(void) {
192
+ // ASCII codes for protein alphabet ARNDCQEGHILKMFPSTWYVBZ?*
193
+ int aa_to_ascii[24] = { 65, 82, 78, 68, 67, 81, 69, 71, 72, 73, 76, 75, 77, 70, 80, 83, 84,
194
+ 87, 89, 86, 66, 90, 63, 42 };
195
+ int i2, j2;
196
+
197
+ // reset score matrix to be safe
198
+ for (int i=0; i<127; i++) {
199
+ for (int j=0; j<127; j++) {
200
+ nucMat[i][j] = 0;
201
+ }
202
+ }
203
+
204
+ // map HIV 25% empirical matrix to score matrix
205
+ for (int i=0; i<24; i++) {
206
+ i2 = aa_to_ascii[i];
207
+ for (int j=0; j<24; j++) {
208
+ j2 = aa_to_ascii[j];
209
+ // also map to lowercase
210
+ nucMat[i2+32][j2+32] = nucMat[i2+32][j2] = nucMat[i2][j2+32] = nucMat[i2][j2] = empirical_hiv25[i][j];
211
+ }
212
+ }
213
+ }
214
+
215
+
216
+ extern int pairscore(char a, char b)
217
+ {
218
+ return nucMat[a][b];
219
+ }
220
+
221
/*
  Reverse the characters of *seq in place.
*/
void reverse(string* seq)
{
    // build the mirrored copy from the reverse iterators, then adopt it
    string flipped(seq->rbegin(), seq->rend());
    seq->swap(flipped);
}
230
+
231
+
232
+ //Error must be somewhere in here. Ug...
233
+ int align(string* seqa, string* seqb, string* newseqa, string* newseqb,
234
+ int gip, int gep, int use_terminal_gap_penalty)
235
+ {
236
+ /*
237
+ Pairwise alignment with affine gap penalty.
238
+ see Gotoh, Osamu. "Optimal alignment between groups of sequences and its application
239
+ to multiple sequence alignment." Computer applications in the biosciences: CABIOS 9.3
240
+ (1993): 361-370.
241
+
242
+ Gap open and extension penalties [gip] and [gep] are assumed to take positive values.
243
+ */
244
+
245
+ int M = seqa->size(); // first group of pre-aligned sequences
246
+ int N = seqb->size(); // second group
247
+
248
+ // if empty ref, return seqb as-is, and seqa as gaps of size(seqb)
249
+ // prevents a buffer overflow in the traceback matrices which assume M>0
250
+ if (M==0)
251
+ {
252
+ int j;
253
+ int alignment_score=0;
254
+ for (j=0 ; j < N ; j++)
255
+ {
256
+ //skip terminal (whole seq) gap penalties if user specifies this option
257
+ if (use_terminal_gap_penalty==0) alignment_score += (j==0) ? (gip+gep) : gep ;
258
+ *newseqa += '-';
259
+ *newseqb += (*seqb)[j];;
260
+ }
261
+
262
+ return alignment_score;
263
+ }
264
+
265
+ int i, j;
266
+
267
+ // not all elements of D, P, and Q need to be stored - vectors are adequate
268
+ int *SS=new int[N+1]; // D(i, .)
269
+ int *oldSS=new int[N+1]; // D(i-1, .)
270
+ int *PP = new int[N+1]; // P(i, .)
271
+
272
+ // Gotoh traceback matrices
273
+ int **piSS = new int*[M+1];
274
+ int **pjSS = new int*[M+1];
275
+
276
+ int u = -gip; // affine gap initiation penalty
277
+ int v = -gep; // affine gap extension penalty
278
+
279
+ int w1 = u + v; // gap weight w_k = v * k + u for k = 1
280
+ int t = u;
281
+ int s, q;
282
+
283
+ // initialize vectors
284
+ for (j=0; j<N+1; j++)
285
+ {
286
+ SS[j]=0;
287
+ oldSS[j]=0;
288
+ PP[j]=0;
289
+ }
290
+
291
+ // initialize traceback matrices
292
+ piSS[0] = new int[N+1];
293
+ pjSS[0] = new int[N+1];
294
+ piSS[1] = new int[N+1];
295
+ pjSS[1] = new int[N+1];
296
+ piSS[1][0] = 0;
297
+ pjSS[1][0] = 0;
298
+ piSS[0][1] = 0;
299
+ pjSS[0][1] = 0;
300
+
301
+ int maxiS = -100000;
302
+ int maxjS = -100000;
303
+ int maxij, maxji;
304
+
305
+ for (i=1; i < M+1; i++)
306
+ {
307
+ t += v; // update gap extension
308
+ s = t;
309
+ SS[0]=0;
310
+ q = t + u;
311
+
312
+ // add new rows
313
+ if (i>1)
314
+ {
315
+ piSS[i] = new int[N+1];
316
+ pjSS[i] = new int[N+1];
317
+ }
318
+
319
+ for (j = 1; j < N + 1; j++)
320
+ {
321
+ // recursive calculation of Q
322
+ if (q >= s + u )
323
+ q += v; // extension
324
+ else
325
+ q = s + u + v; // open
326
+
327
+ // recursive calculation of P
328
+ if ((oldSS[j] + w1) > (PP[j] + v))
329
+ PP[j] = oldSS[j] + w1;
330
+ else
331
+ PP[j] += v;
332
+
333
+ int tmp_pp = PP[j];
334
+
335
+ // D(i-1, j-1) + d(a_i, b_j)
336
+ int pscore = oldSS[j - 1] + pairscore((*seqa)[i - 1], (*seqb)[j - 1]);
337
+
338
+ //no idea if this will work, but its supposed to be a stop codon aligner
339
+ //the bonus is assigned on the last codon, if the codons between don't make a big difference it'll be wrong. Hrm.
340
+
341
+ if(i >= 3 && j >= 3 && (*seqa)[i-3] == '$' && (*seqa)[i-2] == '$' && (*seqa)[i-1] == '$' &&
342
+ (((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'A' && (*seqb)[j-1] == 'G') ||
343
+ ((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'A' && (*seqb)[j-1] == 'A') ||
344
+ ((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'G' && (*seqb)[j-1] == 'A') ))
345
+ {
346
+ pscore += 6;
347
+ }
348
+ if(i >= 3 && j >= 3 && (*seqa)[i-2] == '$' && (*seqa)[i-1] == '$' && (*seqa)[i-0] == '$' &&
349
+ (((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'A' && (*seqb)[j-0] == 'G') ||
350
+ ((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'A' && (*seqb)[j-0] == 'A') ||
351
+ ((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'G' && (*seqb)[j-0] == 'A') ))
352
+ {
353
+ pscore += 6;
354
+ }
355
+ if(i >= 3 && j >= 3 && (*seqa)[i-1] == '$' && (*seqa)[i-0] == '$' && (*seqa)[i+1] == '$' &&
356
+ (((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'A' && (*seqb)[j+1] == 'G') ||
357
+ ((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'A' && (*seqb)[j+1] == 'A') ||
358
+ ((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'G' && (*seqb)[j+1] == 'A') ))
359
+ {
360
+ pscore += 6;
361
+ }
362
+
363
+ /*
364
+ D(i,j) = Min { D(i-1, j-1) + d(a_i, b_j), P(i,j), Q(i,j) }
365
+ where P(i,j) = Min { D(i-k, j) + w_k } for k = 1, .., i
366
+ and Q(i,j) = Min { D(i, j-k) + w_k } for k = 1, ..., j
367
+
368
+ i.e., three options are:
369
+ 1. match/mismatch,
370
+ 2. gap open/extension in sequence (a),
371
+ 3. gap open/extension in sequence (b)
372
+
373
+ pscore = D(i-1, j-1) + d(a_i, b_j)
374
+ tmp_pp = P(i,j)
375
+ q = Q(i,j)
376
+ */
377
+
378
+ //maybe just >?
379
+ if (tmp_pp >= pscore)
380
+ {
381
+ if (tmp_pp > q)
382
+ {
383
+ // gap open / extension in (a)
384
+ s = tmp_pp;
385
+ piSS[i][j] = i - 1;
386
+ pjSS[i][j] = j;
387
+ }
388
+ else // q > tmp_pp > pscore
389
+ {
390
+ // gap open / extension in (b)
391
+ s = q;
392
+ piSS[i][j] = i;
393
+ pjSS[i][j] = j - 1;
394
+ }
395
+ }
396
+ else // pscore > tmp_pp)
397
+ {
398
+ if (pscore > q)
399
+ {
400
+ // match / mismatch
401
+ s = pscore;
402
+ piSS[i][j] = i - 1;
403
+ pjSS[i][j] = j - 1;
404
+ }
405
+ else // q > pscore > tmp_pp
406
+ {
407
+ // gap open / extension in (b)
408
+ s = q;
409
+ piSS[i][j] = i;
410
+ pjSS[i][j] = j - 1;
411
+ }
412
+ }
413
+
414
+ SS[j] = s;
415
+
416
+ if (i == M && SS[j] >= maxiS)
417
+ {
418
+ maxiS = SS[j];
419
+ maxij = j;
420
+ }
421
+ }
422
+
423
+ if (SS[N] >= maxjS)
424
+ {
425
+ maxjS = SS[N];
426
+ maxji = i;
427
+ }
428
+
429
+ for (j = 0; j < N + 1; j++)
430
+ {
431
+ oldSS[j] = SS[j];
432
+ }
433
+ }
434
+
435
+ if (maxij>N)
436
+ maxij=N;
437
+ if (maxji>M)
438
+ maxji=M;
439
+ if (maxij<0)
440
+ maxij=0;
441
+ if (maxji<0)
442
+ maxji=0;
443
+
444
+ //add starting -'s
445
+ int alignment_score;
446
+ if (maxiS > maxjS)
447
+ {
448
+ alignment_score = maxiS;
449
+ i = M;
450
+ j = maxij;
451
+ for (int kk = N; kk > maxij; kk--)
452
+ {
453
+ *newseqb += (*seqb)[kk - 1];
454
+ *newseqa += '-';
455
+ }
456
+ }
457
+ else
458
+ {
459
+ alignment_score = maxjS;
460
+ i = maxji;
461
+ j = N;
462
+ for (int kk = M; kk > maxji; kk--)
463
+ {
464
+ *newseqa += (*seqa)[kk - 1];
465
+ *newseqb += '-';
466
+ }
467
+ }
468
+
469
+ bool decI = false;
470
+ bool decJ = false;
471
+ //inserting -'s in the middle!
472
+ while(i >= 1 && j >= 1)
473
+ {
474
+ decI=false;
475
+ decJ=false;
476
+ if (piSS[i][j] < i)
477
+ {
478
+ *newseqa += (*seqa)[i - 1];
479
+ decI = true;
480
+ }
481
+ else
482
+ {
483
+ *newseqa += '-';
484
+ }
485
+
486
+ if (pjSS[i][j] < j)
487
+ {
488
+ *newseqb += (*seqb)[j - 1];
489
+ decJ=true;
490
+ }
491
+ else
492
+ {
493
+ *newseqb += '-';
494
+ }
495
+
496
+ if (decI)
497
+ {
498
+ i--;
499
+ }
500
+ if (decJ)
501
+ {
502
+ j--;
503
+ }
504
+ }
505
+
506
+ //add extra trailing -'s
507
+ //forgive terminal gap penalties if user specifies this option
508
+ if (i < j)
509
+ {
510
+ for (int jj = j; jj >= 1; jj--)
511
+ {
512
+ *newseqb += (*seqb)[jj - 1];
513
+ *newseqa += '-';
514
+ if (use_terminal_gap_penalty==0) alignment_score += gep;
515
+ }
516
+ if (use_terminal_gap_penalty==0) alignment_score += gip;
517
+ }
518
+ else if(i > j)
519
+ {
520
+ for (int ii = i; ii >= 1; ii--)
521
+ {
522
+ *newseqa += (*seqa)[ii - 1];
523
+ *newseqb += '-';
524
+ if (use_terminal_gap_penalty==0) alignment_score += gep;
525
+ }
526
+ if (use_terminal_gap_penalty==0) alignment_score += gip;
527
+ }
528
+
529
+ reverse(newseqa);
530
+ reverse(newseqb);
531
+
532
+ for (i = 0; i < M + 1; i++)
533
+ {
534
+ delete []piSS[i];
535
+ delete []pjSS[i];
536
+ }
537
+
538
+ delete []SS;
539
+ delete []oldSS;
540
+ delete []piSS;
541
+ delete []pjSS;
542
+ delete []PP;
543
+ return alignment_score;
544
+ }
545
+
546
/*
  Remove pre-existing gap characters ('-') from *seq prior to alignment.

  Works in a single pass: every non-gap character is copied down to the next
  free slot and the string is then shortened to the number of characters kept.
*/
void degap(string* seq)
{
    string::size_type kept = 0;
    for (string::size_type pos = 0; pos < seq->size(); pos++)
    {
        if ((*seq)[pos] != '-')
        {
            (*seq)[kept] = (*seq)[pos];
            kept++;
        }
    }
    seq->resize(kept);
}
561
+
562
/*
  Remove leading and trailing whitespace (space, tab, newline, carriage
  return) from *seq in place.  A string that is empty or consists only of
  whitespace becomes the empty string.
*/
void trim(string* seq)
{
    static const char whitespace[] = " \t\n\r";

    const string::size_type first = seq->find_first_not_of(whitespace);
    if (first == string::npos)
    {
        // nothing but whitespace (or nothing at all)
        seq->clear();
        return;
    }

    // a non-whitespace character exists, so this search cannot fail
    const string::size_type last = seq->find_last_not_of(whitespace);

    // cut the tail first so that 'first' still refers to the same character
    seq->erase(last + 1);
    seq->erase(0, first);
}
577
+
578
+
579
/*
  Cluster nearby gaps in *seq.

  For every gap at position i the letter next to it is examined on both sides.
  Looking backwards from i-1 (and forwards from i+1), the scan walks over a run
  of that same letter; if the run ends in another gap, the neighbouring letter
  is moved into that gap and its old position next to i becomes a gap.  A
  different letter, or the end of the string, stops the scan without a change.

  Gaps at the first or last position simply have no neighbour on that side.
*/
void widen_gaps(string* seq)
{
    const int size = seq->size();
    for (int i = 0; i < size; i++)
    {
        if ((*seq)[i] != '-')
        {
            continue;
        }

        // backwards: needs a neighbour at i - 1
        if (i >= 1)
        {
            const char letter = (*seq)[i - 1];
            // signed index, so running off the front ends the loop
            for (int j = i - 2; j >= 0; j--)
            {
                if ((*seq)[j] == '-')
                {
                    // swap this gap with the letter at i - 1
                    (*seq)[j] = letter;
                    (*seq)[i - 1] = '-';
                    break;
                }
                if ((*seq)[j] != letter)
                {
                    break;
                }
                // same letter: keep walking along the run
            }
        }

        // forwards: needs a neighbour at i + 1
        if (i + 1 < size)
        {
            const char letter = (*seq)[i + 1];
            for (int j = i + 2; j < size; j++)
            {
                if ((*seq)[j] == '-')
                {
                    // swap this gap with the letter at i + 1
                    (*seq)[j] = letter;
                    (*seq)[i + 1] = '-';
                    break;
                }
                if ((*seq)[j] != letter)
                {
                    break;
                }
                // same letter: keep walking along the run
            }
        }
    }
}
638
+
639
+ #ifdef __PYTHON__
640
+ /* Python wrapper functions */
641
+ static PyObject * align_it(PyObject * self, PyObject * args)
642
+ {
643
+ const char * standard;
644
+ const char * seq;
645
+ int gap_init_penalty;
646
+ int gap_extend_penalty;
647
+ int use_terminal_gap_penalty;
648
+ int score;
649
+
650
+ if (!PyArg_ParseTuple(args, "ssiii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty, &use_terminal_gap_penalty)) {
651
+ return NULL;
652
+ }
653
+
654
+ init_pairscore(5, 4); // match, mismatch scores +5, -4 respectively (HyPhy defaults)
655
+
656
+ string* seqa = new string(standard);
657
+ string* seqb = new string(seq);
658
+ trim(seqa);
659
+ trim(seqb);
660
+ //degap(seqa);
661
+ //degap(seqb);
662
+ string* newseqa = new string();
663
+ string* newseqb = new string();
664
+
665
+ score = align(seqa, seqb, newseqa, newseqb, gap_init_penalty, gap_extend_penalty, use_terminal_gap_penalty);
666
+
667
+ PyObject * retval = Py_BuildValue("ssi", newseqa->c_str(), newseqb->c_str(), score);
668
+
669
+ delete seqa;
670
+ delete seqb;
671
+ delete newseqa;
672
+ delete newseqb;
673
+
674
+ return retval;
675
+ }
676
+
677
+ static PyObject * align_it_rb(PyObject * self, PyObject * args)
678
+ {
679
+ // emulate Ruby implementation of align_it
680
+ const char * standard;
681
+ const char * seq;
682
+ int gap_init_penalty;
683
+ int gap_extend_penalty;
684
+
685
+ if (!PyArg_ParseTuple(args, "ssii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty)) {
686
+ return NULL;
687
+ }
688
+
689
+ init_pairscore(1, 1);
690
+
691
+ string* seqa = new string(standard); // reference
692
+ string* seqb = new string(seq); // query
693
+ trim(seqa);
694
+ trim(seqb);
695
+ degap(seqa);
696
+ degap(seqb);
697
+ string* newseqa = new string();
698
+ string* newseqb = new string();
699
+
700
+ align(seqa, seqb, newseqa, newseqb, gap_init_penalty, gap_extend_penalty, 0);
701
+
702
+ PyObject * retval = Py_BuildValue("ss", newseqa->c_str(), newseqb->c_str());
703
+ delete seqa;
704
+ delete seqb;
705
+ delete newseqa;
706
+ delete newseqb;
707
+
708
+ return retval;
709
+ }
710
+
711
+ static PyObject * align_it_aa(PyObject * self, PyObject * args)
712
+ {
713
+ const char * standard;
714
+ const char * seq;
715
+ int gap_init_penalty;
716
+ int gap_extend_penalty;
717
+ int use_terminal_gap_penalty;
718
+ int score;
719
+
720
+ if (!PyArg_ParseTuple(args, "ssiii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty, &use_terminal_gap_penalty)) {
721
+ return NULL;
722
+ }
723
+
724
+ init_pairscore_hiv25();
725
+
726
+ string* seqa = new string(standard); // reference
727
+ string* seqb = new string(seq); // query
728
+ trim(seqa);
729
+ trim(seqb);
730
+ //degap(seqa); // HyPhy behaviour is to not remove gaps
731
+ //degap(seqb);
732
+ string* newseqa = new string();
733
+ string* newseqb = new string();
734
+
735
+ score = align(seqa, seqb, newseqa, newseqb, gap_init_penalty, gap_extend_penalty, use_terminal_gap_penalty);
736
+
737
+ PyObject * retval = Py_BuildValue("ssi", newseqa->c_str(), newseqb->c_str(), score);
738
+ delete seqa;
739
+ delete seqb;
740
+ delete newseqa;
741
+ delete newseqb;
742
+
743
+ return retval;
744
+ }
745
+
746
+ static PyObject * align_it_aa_rb(PyObject * self, PyObject * args)
747
+ {
748
+ // emulate Ruby implementation of align_it_aa
749
+ const char * standard;
750
+ const char * seq;
751
+ int gap_init_penalty;
752
+ int gap_extend_penalty;
753
+
754
+ if (!PyArg_ParseTuple(args, "ssii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty)) {
755
+ return NULL;
756
+ }
757
+
758
+ init_pairscore_aa(4, -2);
759
+
760
+ string* seqa = new string(standard); // reference
761
+ string* seqb = new string(seq); // query
762
+ trim(seqa);
763
+ trim(seqb);
764
+ degap(seqa);
765
+ degap(seqb);
766
+ string* newseqa = new string();
767
+ string* newseqb = new string();
768
+
769
+ align(seqa, seqb, newseqa, newseqb, gap_init_penalty, gap_extend_penalty, 0);
770
+
771
+ PyObject * retval = Py_BuildValue("ss", newseqa->c_str(), newseqb->c_str());
772
+ delete seqa;
773
+ delete seqb;
774
+ delete newseqa;
775
+ delete newseqb;
776
+
777
+ return retval;
778
+ }
779
+
780
+ static PyMethodDef AlignmentMethods [] =
781
+ {
782
+ {"align_it", align_it, METH_VARARGS, "Pairwise alignment of nucleotide sequences."},
783
+ {"align_it_rb", align_it_rb, METH_VARARGS, "Pairwise alignment of nucleotide sequences using ReCall settings."},
784
+ {"align_it_aa", align_it_aa, METH_VARARGS, "Pairwise alignment of protein sequences using empirical HIV 25% score matrix."},
785
+ {"align_it_aa_rb", align_it_aa_rb, METH_VARARGS, "Pairwise alignment of protein sequences using ReCall settings."},
786
+ {NULL, NULL, 0, NULL}
787
+ };
788
+
789
+ static struct PyModuleDef AlignmentModuleDef = {
790
+ PyModuleDef_HEAD_INIT,
791
+ "gotoh",
792
+ NULL,
793
+ -1,
794
+ AlignmentMethods,
795
+ NULL,
796
+ NULL,
797
+ NULL,
798
+ NULL
799
+ };
800
+
801
+ PyMODINIT_FUNC PyInit_gotoh(void) {
802
+ return PyModule_Create(&AlignmentModuleDef);
803
+ }
804
+
805
+ #else
806
+ /* Ruby wrapper functions */
807
+ extern "C" VALUE align_it(VALUE self, VALUE standard, VALUE seq, VALUE gap_init_penalty, VALUE gap_extend_penalty)
808
+ {
809
+ init_pairscore(1, 1);
810
+
811
+ string* seqa = new string(RSTRING_PTR(standard));
812
+ string* seqb = new string(RSTRING_PTR(seq));
813
+ trim(seqa);
814
+ trim(seqb);
815
+ degap(seqa);
816
+ degap(seqb);
817
+ string* newseqa = new string();
818
+ string* newseqb = new string();
819
+ align(seqa, seqb, newseqa, newseqb, NUM2INT(gap_init_penalty), NUM2INT(gap_extend_penalty), 0);
820
+
821
+ VALUE ret = rb_ary_new3(2, rb_str_new2(newseqa->c_str()),rb_str_new2(newseqb->c_str()));
822
+
823
+ delete seqa;
824
+ delete seqb;
825
+ delete newseqa;
826
+ delete newseqb;
827
+
828
+ return ret;
829
+ }
830
+
831
+ extern "C" VALUE align_it_aa(VALUE self, VALUE standard, VALUE seq, VALUE gap_init_penalty, VALUE gap_extend_penalty)
832
+ {
833
+ init_pairscore_aa(4, -2);
834
+
835
+ string* seqa = new string(RSTRING_PTR(standard));
836
+ string* seqb = new string(RSTRING_PTR(seq));
837
+ trim(seqa);
838
+ trim(seqb);
839
+ degap(seqa);
840
+ degap(seqb);
841
+ string* newseqa = new string();
842
+ string* newseqb = new string();
843
+ align(seqa, seqb, newseqa, newseqb, NUM2INT(gap_init_penalty), NUM2INT(gap_extend_penalty), 0);
844
+
845
+ VALUE ret = rb_ary_new3(2, rb_str_new2(newseqa->c_str()),rb_str_new2(newseqb->c_str()));
846
+
847
+ delete seqa;
848
+ delete seqb;
849
+ delete newseqa;
850
+ delete newseqb;
851
+
852
+ return ret;
853
+ }
854
+
855
+ extern "C" void Init_cfe_gotoh()
856
+ {
857
+ VALUE gotoh = rb_define_module("CfeGotoh");
858
+ rb_define_module_function(gotoh, "align_it", (VALUE(*)(...))align_it, 4);
859
+ rb_define_module_function(gotoh, "align_it_aa", (VALUE(*)(...))align_it_aa, 4);
860
+ }
861
+
862
+ #endif
@@ -0,0 +1,4 @@
1
# Build configuration for the cfe_gotoh native extension.
require 'mkmf'

create_header
create_makefile 'cfe_gotoh/cfe_gotoh'
data/lib/cfe_gotoh.rb ADDED
@@ -0,0 +1,413 @@
1
+ #TODO: Scoring algorithm to improve frame_align?
2
+
3
+ require_relative 'cfe_gotoh/cfe_gotoh'
4
+
5
+
6
+ module CfeGotoh
7
# Base class for errors defined by this module.
class Error < RuntimeError; end

# Raised by cluster_gaps (when asked to raise errors) for a gap that
# cannot be merged into a codon-sized gap.
class GapMergeError < Error; end
12
+
13
# Substitution scores for score_alignment: a 127x127 table indexed by
# character code on both axes. Every pair starts at -1.0.
sub_matrix = Array.new(127) { Array.new(127, -1.0) }

# Store +score+ for the pair (a, b) in both orientations.
set_pair = lambda do |a, b, score|
  sub_matrix[b.ord][a.ord] = score
  sub_matrix[a.ord][b.ord] = score
end

# Identity matches, plus the penalty against 'X' (not applied to 'N').
%w[A T G C R Y K M B D H V S W N].each do |nuc|
  set_pair.call(nuc, nuc, 1.0)
  set_pair.call(nuc, 'X', -6.0) unless nuc == 'N'
end

# Mixture codes score as a match against each base listed for them.
{
  # bi-mixtures
  'R' => %w[A G],
  'Y' => %w[C T],
  'K' => %w[G T],
  'M' => %w[C A],
  'S' => %w[C G],
  'W' => %w[T A],
  # tri-mixtures
  'B' => %w[C G T],
  'D' => %w[A G T],
  'H' => %w[A C T],
  'V' => %w[A C G]
}.each do |mixture, bases|
  bases.each { |base| set_pair.call(base, mixture, 1.0) }
end

# other
set_pair.call('$', '$', 50.0)
set_pair.call('T', 'U', 1.0)
set_pair.call('N', 'N', 0.0) # overrides the identity score set above
set_pair.call('X', '-', 3.0)

special_scores = { '*' => 1.0, '&' => 0.7, '$' => 0.0, '.' => -20.0, 'N' => -3.0 }
%w[A T G C].each do |base|
  special_scores.each { |symbol, score| set_pair.call(base, symbol, score) }
end

# Freeze rows and table so the constant cannot be modified afterwards.
sub_matrix.each(&:freeze)
sub_matrix.freeze

NUCLEOTIDE_MATRIX = sub_matrix
60
+
61
# Sum the NUCLEOTIDE_MATRIX score of each aligned character pair.
# Characters are upcased before lookup; iteration covers every position
# of +standard+. Returns a Float (0.0 for an empty standard).
def self.score_alignment(standard, query)
  (0...standard.size).inject(0.0) do |total, pos|
    row = NUCLEOTIDE_MATRIX[standard[pos, 1].upcase.ord]
    total + row[query[pos, 1].upcase.ord]
  end
end
68
+
69
# List the runs of '-' in +seq+ as arrays of consecutive indices,
# e.g. "A--C-" => [[1, 2], [4]]. Returns [] when there are no dashes.
def self.make_gap_list(seq)
  dash_positions = (0...seq.size).select { |pos| seq[pos, 1] == '-' }
  # Start a new run wherever two dash positions are not adjacent.
  dash_positions.slice_when { |left, right| right != left + 1 }.to_a
end
88
+
89
# Remove the run of dashes at the very start of +standard+, and the same
# number of leading characters from +query+. Both strings are modified
# in place. Nothing happens when +standard+ does not begin with dashes
# followed by a non-dash character.
def self.trim_leading_dashes(standard, query)
  # \A anchors at the start of the string; Ruby's ^ also matches after any
  # newline, which could pick up dashes from the middle of the input.
  leading_dashes_match = /\A(-+)[^-]/.match(standard)
  return if leading_dashes_match.nil?

  dash_count = leading_dashes_match[1].size
  standard[0, dash_count] = ''
  query[0, dash_count] = ''
end
98
+
99
# Remove the run of dashes at the very end of +standard+, and the
# characters at the same positions from +query+. Both strings are
# modified in place. Nothing happens when +standard+ does not end with a
# non-dash character followed by dashes.
def self.trim_trailing_dashes(standard, query)
  # \z anchors at the end of the string; Ruby's $ also matches before any
  # newline, which could pick up dashes from the middle of the input.
  trailing_dashes_match = /[^-](-+)\z/.match(standard)
  return if trailing_dashes_match.nil?

  dash_count = trailing_dashes_match[1].size
  end_of_standard = standard.size - dash_count
  standard[end_of_standard, dash_count] = ''
  query[end_of_standard, dash_count] = ''
end
109
+
110
# Pad the dash run at one edge of +query+ up to a whole number of codons
# by overwriting the adjacent query characters with dashes. +query+ is
# modified in place.
#
# side - :leading (default) works on the start of the string; any other
#        value works on the end.
#
# Nothing happens when the edge character is not a dash, when the run is
# already a multiple of 3 long, or when +query+ consists only of dashes.
# The return value is not meaningful.
def self.fix_incomplete_edge_codon(query, side=:leading)
  leading = (side == :leading)
  edge_idx = leading ? 0 : -1
  return unless query[edge_idx] == '-'

  # \A / \z anchor to the string itself rather than to any line in it.
  dash_regex = leading ? /\A(-+)[^-]/ : /[^-](-+)\z/
  dash_match = dash_regex.match(query)
  # An all-dash query has no non-dash neighbour, so the pattern cannot
  # match; there is nothing to convert in that case.
  return if dash_match.nil?

  dash_count = dash_match[1].size
  return if dash_count % 3 == 0

  # Index of the query character bordering the dash run, and the direction
  # that leads further into the sequence from there.
  first_non_dash_idx = leading ? dash_count : query.size - dash_count - 1
  incr = leading ? 1 : -1

  query[first_non_dash_idx] = '-'
  # A run of 3k+1 dashes needs two more to complete the codon.
  query[first_non_dash_idx + incr] = '-' if dash_count % 3 == 1
end
138
+
139
# Cancel insertions (dashes in +standard+) against nearby deletions
# (dashes in +query+) until the aligned length is divisible by 3. This
# helps fix poor insertions near the start of the sequence.
#
# Both strings are modified in place. Raises if they differ in length.
# Always returns nil.
def self.merge_insertions_and_deletions_to_fix_oof_sequences(
  standard,
  query
)
  unless standard.size == query.size
    raise 'Standard and query should be the same length'
  end
  return nil if (standard.size % 3).zero?

  search_from = 0
  loop do
    search_from = standard.index('-', search_from)
    break if search_from.nil?

    # Look one base to either side first, then two bases away.
    partner = [-1, 1, -2, 2]
              .map { |delta| search_from + delta }
              .find { |pos| pos >= 0 && query[pos] == '-' }
    unless partner.nil?
      standard[search_from] = ''
      query[partner] = ''
      search_from = 0
    end

    # Stop as soon as the sequences have a cogent length.
    break if (standard.size % 3).zero?

    search_from += 1
  end
  nil
end
167
+
168
# Merge neighbouring gaps whose sizes are not multiples of 3 into
# codon-sized gaps.
#
# gaps         - list of gaps, each an array of consecutive positions.
#                Entries absorbed by a merge are replaced with [] in place.
# raise_errors - when true, raise GapMergeError for a gap that cannot be
#                merged; otherwise such a gap is passed through unchanged.
#
# Returns the new list of gaps.
def self.cluster_gaps(gaps, raise_errors=false)
  clustered = []
  gaps.each_index do |idx|
    current = gaps[idx]
    next if current.empty? # already absorbed by an earlier merge

    if (current.size % 3).zero? # already codon-sized
      clustered << current
      next
    end

    # Either of these may be nil near the end of the list.
    following = gaps[idx + 1]
    third = gaps[idx + 2]

    if following &&
       ((current.size + following.size) % 3).zero? &&
       (following.first - current.last) < 9
      # Merge with the next gap, growing whichever of the two is larger.
      clustered << if following.size > current.size
                     ((following.first - current.size)...following.first).to_a + following
                   else
                     current + ((current.last + 1)..(current.last + following.size)).to_a
                   end
      gaps[idx + 1] = []
    elsif following && third &&
          ((current.size + following.size + third.size) % 3).zero? &&
          (third.first - current.last) < 12
      # Merge with the next two gaps, centred on the middle one.
      before_middle = ((following.first - current.size)...following.first).to_a
      after_middle = ((following.last + 1)..(following.last + third.size)).to_a
      clustered << before_middle + following + after_middle
      gaps[idx + 1] = []
      gaps[idx + 2] = []
    elsif raise_errors
      raise GapMergeError
    else
      # FIXME this behaviour differs between insertions and deletions
      clustered << current
    end
  end
  clustered
end
215
+
216
# Shift each gap so that it starts on a codon boundary, preferring a
# nearby common gap location when any are supplied.
#
# gaps                 - gaps in ascending (left-to-right) order; each gap
#                        array is rewritten in place.
# common_gap_locations - optional list of preferred locations; each is
#                        multiplied by 3 to give a base position.
#
# Returns +gaps+.
def self.align_gaps_to_frame(gaps, common_gap_locations=nil)
  shift_from_earlier_gaps = 0 # offset created by previous gaps
  gaps.each do |gap|
    unless common_gap_locations.nil?
      unshifted_start = gap[0] - shift_from_earlier_gaps
      nearest = common_gap_locations.min_by { |loc| (3 * loc - unshifted_start).abs }
      # Snap to the nearest common location if it is within 9 bases.
      if !nearest.nil? && (3 * nearest - unshifted_start).abs <= 9
        anchor = 3 * nearest + shift_from_earlier_gaps
        gap.replace(Array.new(gap.size) { |k| anchor + k })
      end
    end

    # Move to the nearest frame boundary.
    # Original comment from Conan: scoring would be good here
    case gap[0] % 3
    when 1 then gap.map! { |pos| pos - 1 } # set back one base
    when 2 then gap.map! { |pos| pos + 1 } # set forward one base
    end

    shift_from_earlier_gaps += gap.size
  end
  gaps
end
258
+
259
# Strip every dash from +seq+ and insert dashes at the given positions.
# Positions are indices in the *aligned* sequence, so they already include
# the shift caused by gaps placed earlier; gaps must be in left-to-right
# order. A position past the end appends the dash instead.
# Returns a new string; +seq+ itself is not modified.
def self.splice_gaps_into_sequence(seq, gaps)
  spliced = seq.delete('-')
  gaps.each do |gap|
    gap.each do |pos|
      spliced.insert(pos > spliced.size ? -1 : pos, '-')
    end
  end
  spliced
end
276
+
277
# Align +query+ to +standard+ and move the resulting gaps onto codon
# boundaries.
#
# standard, query         - the sequences to align; with +prealigned+ they
#                           must already be aligned and of equal length.
# gap_init, gap_penalty   - passed through to align_it.
# common_insert_locations - preferred insertion locations, given as amino
#                           acid locations starting at base 0 (nil means
#                           none). Original note: assumes standard in the
#                           first base.
# trim                    - when true, trim the leading/trailing dashes of
#                           the standard and pad incomplete edge codons in
#                           the query.
# raise_errors            - when true, raise instead of returning an
#                           alignment whose gaps cannot be put in frame.
# prealigned              - when true, skip align_it so the corrections and
#                           QC can be run on an already aligned pair.
#
# Returns [standard, query]. The arguments themselves are never modified.
def self.frame_align(
  standard,
  query,
  gap_init=3,
  gap_penalty=1,
  common_insert_locations=nil,
  trim=false,
  raise_errors=false,
  prealigned=false
)
  common_insert_locations = [] if common_insert_locations.nil?

  if prealigned
    # The trimming and merging steps below edit their arguments in place;
    # work on copies so the caller's (possibly frozen) strings are untouched.
    standard = standard.dup
    query = query.dup
  else
    standard, query = align_it(standard, query, gap_init, gap_penalty)
  end
  raise "Standard and query should be the same length" if standard.size() != query.size()

  # Trim leading and trailing dashes if desired.
  if trim
    trim_leading_dashes(standard, query)
    trim_trailing_dashes(standard, query)
    fix_incomplete_edge_codon(query, :leading)
    fix_incomplete_edge_codon(query, :trailing)
  end

  merge_insertions_and_deletions_to_fix_oof_sequences(standard, query)

  if raise_errors
    inserted_bases = standard.count('-')
    if inserted_bases % 3 != 0
      raise "Cannot frame align, #{inserted_bases} inserted bases not divisible by 3"
    end
    deleted_bases = query.count('-')
    if deleted_bases % 3 != 0
      raise "Cannot frame align, #{deleted_bases} deleted bases not divisible by 3"
    end
  end

  # Build the insert/delete lists. These lists look like
  # [[3,4,5], [9], [11,12]]
  insert_list = make_gap_list(standard)
  delete_list = make_gap_list(query)

  # Process the insertions.
  if insert_list.size() > 0
    new_ins_list = []

    # Step 1: cluster the insertions.
    begin
      new_ins_list = cluster_gaps(insert_list, raise_errors)
    rescue GapMergeError
      raise "Cannot frame align insert" if raise_errors
    end

    # Step 2: frame-align the insertions, shifting things to common
    # insertion positions where appropriate.
    align_gaps_to_frame(new_ins_list, common_insert_locations)

    # Put the insertions back into the standard.
    standard = splice_gaps_into_sequence(standard, new_ins_list)
  end

  # Process the deletions.
  if delete_list.size() > 0
    new_del_list = []

    # As above, step 1 is to cluster the deletions.
    # FIXME note that the original code behaved differently between
    # insertions and deletions; confirm that this is the right
    # way forward.
    begin
      new_del_list = cluster_gaps(delete_list, raise_errors)
    rescue GapMergeError
      raise "Cannot frame align deletion" if raise_errors
    end

    # Again as above, frame-align the deletions; this time
    # we don't worry about any common deletion positions.
    align_gaps_to_frame(new_del_list)

    # Put the deletions back into the query.
    query = splice_gaps_into_sequence(query, new_del_list)
  end

  return [standard, query]
end
365
+
366
# Convenience wrapper around remove_insertions_from_query for an aligned
# pair +elem+ = [standard, query].
# Returns [seq_sans_inserts, [list of inserts]].
def self.remove_inserts(elem)
  aligned_standard = elem[0]
  aligned_query = elem[1]
  remove_insertions_from_query(aligned_standard, aligned_query)
end
370
+
371
# Remove from +query+ every character aligned against a dash in
# +standard+ (an insertion) and report what was removed.
#
# Returns [seq, inserts], where +seq+ is the query without its inserted
# characters and +inserts+ holds one [location, inserted_text] pair per
# run of consecutive dashes in +standard+. The location is the run's start
# position, less the lengths of all earlier insertions, integer-divided
# by 3. Neither argument is modified.
def self.remove_insertions_from_query(standard, query)
  insert_positions = (0...standard.size).select { |i| standard[i, 1] == '-' }

  # Cluster adjacent insert positions into runs.
  insert_runs = insert_positions.slice_when { |left, right| right != left + 1 }.to_a

  offset = 0 # total length of the insertions seen so far
  inserts = insert_runs.map do |run|
    inserted_text = run.map { |i| query[i, 1] }.join
    entry = [(run.first - offset) / 3, inserted_text]
    offset += run.size
    entry
  end

  # Delete the inserted characters directly, working from the right so the
  # remaining indices stay valid. (Marking them with '.' and stripping every
  # '.' afterwards would also drop genuine '.' characters from the query.)
  seq = query.dup
  insert_positions.reverse_each { |i| seq[i, 1] = '' }

  return [seq, inserts]
end
413
+ end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cfe_gotoh
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0.pre
5
+ platform: ruby
6
+ authors:
7
+ - Conan Woods
8
+ - Jamie Kai
9
+ - David Rickett
10
+ - Richard Liang
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2024-11-22 00:00:00.000000000 Z
15
+ dependencies: []
16
+ description:
17
+ email:
18
+ executables: []
19
+ extensions:
20
+ - ext/cfe_gotoh/extconf.rb
21
+ extra_rdoc_files: []
22
+ files:
23
+ - ext/cfe_gotoh/cfe_gotoh.cpp
24
+ - ext/cfe_gotoh/extconf.rb
25
+ - lib/cfe_gotoh.rb
26
+ homepage:
27
+ licenses: []
28
+ metadata:
29
+ github_repo: ssh://github.com/cfe-lab/gotoh
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">"
42
+ - !ruby/object:Gem::Version
43
+ version: 1.3.1
44
+ requirements: []
45
+ rubygems_version: 3.0.9
46
+ signing_key:
47
+ specification_version: 4
48
+ summary: CfE implementation of the Gotoh sequence alignment algorithm
49
+ test_files: []