cfe_gotoh 0.4.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 69fd99e925d82fb14d1035e8461fd32d6d83895ae9bf012efea277596772e7ce
4
+ data.tar.gz: 92b0780c678fa34b4be4d2148df83ed1f60c9d63c15a5742f3c17fa1cb4beb24
5
+ SHA512:
6
+ metadata.gz: ee778affdf1b42aca92b9acbedf6368df7b906a0b5ab6793132b126f04b781baa2a88a04254b0aa9c51a2f2dd3b09b9b0713edbba9ba85db86e26b6eaa0a5745
7
+ data.tar.gz: 01f949ecf25c278706fe4718b59e9ac3aa6e643d8ab3063a5fa1aba390d7f2fd3b28e7dbe1703fce94a84affd7b6573e1bbac29f117bb3442d7a6ef199fd8cbe
@@ -0,0 +1,862 @@
1
+ #include <string>
2
+
3
+ #ifdef __PYTHON__
4
+ #include <Python.h>
5
+ #else
6
+ #include "ruby.h"
7
+
8
+ #ifndef RSTRING_PTR
9
+ // Ruby 1.8.5 doesn't include this definition
10
+ #define RSTRING_PTR(s) (RSTRING(s)->ptr)
11
+ #endif
12
+ #endif
13
+
14
+ using namespace std;
15
+
16
+ /*
17
+ I think this application should do a complete alignment. Unfortunately aligning is way too slow in perl,
18
+ so I suspect the alignment, merging and possibly the gap widening should be done in c. Another possibility
19
+ is to call the C functions from perl, which would simplify things quite a bit! Unfortunately, I'm not entirely
20
+ confident that Perl can do this seamlessly (unlike nicer languages like Ruby and Python).
21
+ */
22
+
23
+ void trim(string* seq);
24
+
25
// Pair score lookup table indexed by ASCII code. It has 127 rows/columns, so only
// codes 0..126 have an entry: DEL (127) and non-ASCII bytes are NOT representable.
static int nucMat[127][127];

// Store `score` for the pair (a, b) and for the mirrored pair (b, a).
static void set_symmetric_score(char a, char b, int score)
{
    nucMat[(unsigned char)a][(unsigned char)b] = score;
    nucMat[(unsigned char)b][(unsigned char)a] = score;
}

// Score `symbol` against every character of the NUL-terminated list `partners`
// (which must contain upper-case letters only), symmetrically. When withLowerCase
// is true the lower-case form of each partner receives the same score.
static void set_scores_against(char symbol, const char* partners, int score, bool withLowerCase)
{
    for (const char* p = partners; *p != '\0'; ++p)
    {
        set_symmetric_score(symbol, *p, score);
        if (withLowerCase)
        {
            set_symmetric_score(symbol, (char)(*p - 'A' + 'a'), score);
        }
    }
}

/*
  Fill nucMat with nucleotide pair scores.

  matchscore      - score for identical characters and for compatible pairs
                    (case variants, T/U, IUPAC mixtures, '*' wildcard).
  mismatchPenalty - stored negated, i.e. a mismatch scores -mismatchPenalty.

  Fixed relative to the previous revision:
    * ['u']['T'] is now a match (the old code assigned ['t']['T'] twice instead).
    * ['-']['X'] now mirrors ['X']['-'] (the old code assigned ['X']['-'] twice).
*/
void init_pairscore(int matchscore, int mismatchPenalty)
{
    // naive default: identical codes match, everything else mismatches
    for (int i = 0; i < 127; i++)
    {
        for (int j = 0; j < 127; j++)
        {
            nucMat[i][j] = (i == j) ? matchscore : -mismatchPenalty;
        }
    }

    // adjust naive assignments for case-insensitivity
    set_symmetric_score('a', 'A', matchscore);
    set_symmetric_score('c', 'C', matchscore);
    set_symmetric_score('g', 'G', matchscore);

    // T and U are interchangeable in either case
    set_symmetric_score('t', 'T', matchscore);
    set_symmetric_score('u', 'U', matchscore);
    set_symmetric_score('t', 'u', matchscore);
    set_symmetric_score('t', 'U', matchscore);
    set_symmetric_score('T', 'u', matchscore);
    set_symmetric_score('T', 'U', matchscore);

    // N against N scores 0, except lower-case n against itself which keeps matchscore
    nucMat['N']['N'] = 0;
    set_symmetric_score('n', 'N', 0);

    // bi-mixtures (upper case only)
    set_scores_against('R', "AG", matchscore, false);
    set_scores_against('Y', "CT", matchscore, false);
    set_scores_against('K', "GT", matchscore, false);
    set_scores_against('M', "CA", matchscore, false);
    set_scores_against('S', "CG", matchscore, false);
    set_scores_against('W', "TA", matchscore, false);

    // tri-mixtures (upper case only)
    set_scores_against('B', "CGT", matchscore, false);
    set_scores_against('D', "AGT", matchscore, false);
    set_scores_against('H', "ACT", matchscore, false);
    set_scores_against('V', "ACG", matchscore, false);

    // wild card matches any plain base in either case
    set_scores_against('*', "ACTG", matchscore, true);

    // '$' placeholder (used by the stop codon bonus in align) strongly prefers itself
    nucMat['$']['$'] = 50;

    // for those annoying duplicate phred values
    set_scores_against('.', "ACTG", -20, true);

    // upper-case N against a plain base in either case
    set_scores_against('N', "ACTG", -3, true);

    // for easy alignment to a standard with gaps
    set_scores_against('X', "ACTGRYKMSWBDHV", -6, true);
    set_symmetric_score('X', '-', 3);
}
132
+
133
+
134
+ void init_pairscore_aa(int matchscore, int mismatchPenalty)
135
+ {
136
+ for (int i=0; i<127; i++)
137
+ {
138
+ for (int j=0; j<127; j++)
139
+ {
140
+ if(i==j)
141
+ {
142
+ nucMat[i][j]=matchscore;
143
+ }
144
+ else
145
+ {
146
+ nucMat[i][j]=-mismatchPenalty;
147
+ if((char)i=='X' || (char)j=='X')
148
+ {
149
+ nucMat[i][j]=-4;
150
+ }
151
+ }
152
+ }
153
+ }
154
+
155
+ nucMat['Z']['Z']=nucMat['z']['Z']=nucMat['Z']['z']=0;
156
+ nucMat['X']['-']=nucMat['-']['X']=matchscore;
157
+ }
158
+
159
+
160
+ /*
161
+ Empirical score matrix based on 25% divergent HIV sequences
162
+ See Nickle, David C., et al. "HIV-specific probabilistic models of protein evolution."
163
+ PLoS One 2.6 (2007): e503.
164
+ */
165
+ static int empirical_hiv25[24][24] = {\
166
+ {7,-7,-7,-4,-10,-11,-4,-3,-10,-6,-9,-9,-7,-13,-3,-2,1,-16,-15,0,-5,-5,-3,-17},\
167
+ {-7,7,-5,-11,-8,-2,-7,-2,0,-6,-6,2,-3,-12,-4,-2,-2,-5,-9,-10,-7,-3,-3,-17},\
168
+ {-7,-5,8,2,-9,-6,-6,-7,0,-6,-12,0,-10,-12,-9,1,0,-17,-3,-10,6,-6,-3,-17},\
169
+ {-4,-11,2,8,-14,-10,0,-2,-3,-11,-15,-7,-13,-15,-13,-5,-6,-16,-6,-5,7,0,-3,-17},\
170
+ {-10,-8,-9,-14,11,-16,-15,-5,-7,-11,-9,-13,-14,0,-12,-1,-6,-2,0,-8,-10,-16,-5,-17},\
171
+ {-11,-2,-6,-10,-16,8,-2,-10,0,-12,-4,0,-8,-12,-1,-9,-8,-14,-9,-13,-7,6,-4,-17},\
172
+ {-4,-7,-6,0,-15,-2,7,-1,-9,-12,-15,-1,-10,-17,-13,-11,-8,-15,-12,-5,0,6,-4,-17},\
173
+ {-3,-2,-7,-2,-5,-10,-1,7,-10,-11,-14,-6,-12,-9,-11,-1,-7,-5,-14,-5,-4,-3,-4,-17},\
174
+ {-10,0,0,-3,-7,0,-9,-10,10,-10,-4,-5,-10,-6,-3,-6,-6,-11,2,-14,-1,-2,-3,-17},\
175
+ {-6,-6,-6,-11,-11,-12,-12,-11,-10,7,0,-7,0,-2,-10,-4,0,-14,-9,2,-7,-12,-2,-17},\
176
+ {-9,-6,-12,-15,-9,-4,-15,-14,-4,0,6,-10,0,0,-3,-5,-8,-6,-8,-4,-13,-6,-4,-17},\
177
+ {-9,2,0,-7,-13,0,-1,-6,-5,-7,-10,7,-4,-14,-9,-5,-1,-12,-13,-9,-1,-1,-2,-17},\
178
+ {-7,-3,-10,-13,-14,-8,-10,-12,-10,0,0,-4,10,-7,-11,-9,-1,-11,-15,0,-11,-9,-3,-17},\
179
+ {-13,-12,-12,-15,0,-12,-17,-9,-6,-2,0,-14,-7,10,-11,-5,-10,-5,1,-5,-13,-14,-3,-17},\
180
+ {-3,-4,-9,-13,-12,-1,-13,-11,-3,-10,-3,-9,-11,-11,8,-1,-3,-13,-11,-12,-10,-3,-5,-17},\
181
+ {-2,-2,1,-5,-1,-9,-11,-1,-6,-4,-5,-5,-9,-5,-1,8,0,-12,-6,-9,0,-10,-3,-17},\
182
+ {1,-2,0,-6,-6,-8,-8,-7,-6,0,-8,-1,-1,-10,-3,0,7,-16,-10,-4,-2,-8,-2,-17},\
183
+ {-16,-5,-17,-16,-2,-14,-15,-5,-11,-14,-6,-12,-11,-5,-13,-12,-16,10,-4,-16,-16,-14,-8,-17},\
184
+ {-15,-9,-3,-6,0,-9,-12,-14,2,-9,-8,-13,-15,1,-11,-6,-10,-4,10,-12,-4,-10,-4,-17},\
185
+ {0,-10,-10,-5,-8,-13,-5,-5,-14,2,-4,-9,0,-5,-12,-9,-4,-16,-12,7,-7,-7,-3,-17},\
186
+ {-5,-7,6,7,-10,-7,0,-4,-1,-7,-13,-1,-11,-13,-10,0,-2,-16,-4,-7,7,-2,-4,-17},\
187
+ {-5,-3,-6,0,-16,6,6,-3,-2,-12,-6,-1,-9,-14,-3,-10,-8,-14,-10,-7,-2,6,-4,-17},\
188
+ {-3,-3,-3,-3,-5,-4,-4,-4,-3,-2,-4,-2,-3,-3,-5,-3,-2,-8,-4,-3,-4,-4,-3,-17},\
189
+ {-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,1}};
190
+
191
+ void init_pairscore_hiv25(void) {
192
+ // ASCII codes for protein alphabet ARNDCQEGHILKMFPSTWYVBZ?*
193
+ int aa_to_ascii[24] = { 65, 82, 78, 68, 67, 81, 69, 71, 72, 73, 76, 75, 77, 70, 80, 83, 84,
194
+ 87, 89, 86, 66, 90, 63, 42 };
195
+ int i2, j2;
196
+
197
+ // reset score matrix to be safe
198
+ for (int i=0; i<127; i++) {
199
+ for (int j=0; j<127; j++) {
200
+ nucMat[i][j] = 0;
201
+ }
202
+ }
203
+
204
+ // map HIV 25% empirical matrix to score matrix
205
+ for (int i=0; i<24; i++) {
206
+ i2 = aa_to_ascii[i];
207
+ for (int j=0; j<24; j++) {
208
+ j2 = aa_to_ascii[j];
209
+ // also map to lowercase
210
+ nucMat[i2+32][j2+32] = nucMat[i2+32][j2] = nucMat[i2][j2+32] = nucMat[i2][j2] = empirical_hiv25[i][j];
211
+ }
212
+ }
213
+ }
214
+
215
+
216
+ extern int pairscore(char a, char b)
217
+ {
218
+ return nucMat[a][b];
219
+ }
220
+
221
/*
  Reverse the characters of *seq in place.
*/
void reverse(string* seq)
{
    // build the mirrored copy from the reverse iterators, then adopt it
    string flipped(seq->rbegin(), seq->rend());
    seq->swap(flipped);
}
230
+
231
+
232
+ //Error must be somewhere in here. Ug...
233
+ int align(string* seqa, string* seqb, string* newseqa, string* newseqb,
234
+ int gip, int gep, int use_terminal_gap_penalty)
235
+ {
236
+ /*
237
+ Pairwise alignment with affine gap penalty.
238
+ see Gotoh, Osamu. "Optimal alignment between groups of sequences and its application
239
+ to multiple sequence alignment." Computer applications in the biosciences: CABIOS 9.3
240
+ (1993): 361-370.
241
+
242
+ Gap open and extension penalties [gip] and [gep] are assumed to take positive values.
243
+ */
244
+
245
+ int M = seqa->size(); // first group of pre-aligned sequences
246
+ int N = seqb->size(); // second group
247
+
248
+ // if empty ref, return seqb as-is, and seqa as gaps of size(seqb)
249
+ // prevents a buffer overflow in the traceback matrices which assume M>0
250
+ if (M==0)
251
+ {
252
+ int j;
253
+ int alignment_score=0;
254
+ for (j=0 ; j < N ; j++)
255
+ {
256
+ //skip terminal (whole seq) gap penalties if user specifies this option
257
+ if (use_terminal_gap_penalty==0) alignment_score += (j==0) ? (gip+gep) : gep ;
258
+ *newseqa += '-';
259
+ *newseqb += (*seqb)[j];;
260
+ }
261
+
262
+ return alignment_score;
263
+ }
264
+
265
+ int i, j;
266
+
267
+ // not all elements of D, P, and Q need to be stored - vectors are adequate
268
+ int *SS=new int[N+1]; // D(i, .)
269
+ int *oldSS=new int[N+1]; // D(i-1, .)
270
+ int *PP = new int[N+1]; // P(i, .)
271
+
272
+ // Gotoh traceback matrices
273
+ int **piSS = new int*[M+1];
274
+ int **pjSS = new int*[M+1];
275
+
276
+ int u = -gip; // affine gap initiation penalty
277
+ int v = -gep; // affine gap extension penalty
278
+
279
+ int w1 = u + v; // gap weight w_k = v * k + u for k = 1
280
+ int t = u;
281
+ int s, q;
282
+
283
+ // initialize vectors
284
+ for (j=0; j<N+1; j++)
285
+ {
286
+ SS[j]=0;
287
+ oldSS[j]=0;
288
+ PP[j]=0;
289
+ }
290
+
291
+ // initialize traceback matrices
292
+ piSS[0] = new int[N+1];
293
+ pjSS[0] = new int[N+1];
294
+ piSS[1] = new int[N+1];
295
+ pjSS[1] = new int[N+1];
296
+ piSS[1][0] = 0;
297
+ pjSS[1][0] = 0;
298
+ piSS[0][1] = 0;
299
+ pjSS[0][1] = 0;
300
+
301
+ int maxiS = -100000;
302
+ int maxjS = -100000;
303
+ int maxij, maxji;
304
+
305
+ for (i=1; i < M+1; i++)
306
+ {
307
+ t += v; // update gap extension
308
+ s = t;
309
+ SS[0]=0;
310
+ q = t + u;
311
+
312
+ // add new rows
313
+ if (i>1)
314
+ {
315
+ piSS[i] = new int[N+1];
316
+ pjSS[i] = new int[N+1];
317
+ }
318
+
319
+ for (j = 1; j < N + 1; j++)
320
+ {
321
+ // recursive calculation of Q
322
+ if (q >= s + u )
323
+ q += v; // extension
324
+ else
325
+ q = s + u + v; // open
326
+
327
+ // recursive calculation of P
328
+ if ((oldSS[j] + w1) > (PP[j] + v))
329
+ PP[j] = oldSS[j] + w1;
330
+ else
331
+ PP[j] += v;
332
+
333
+ int tmp_pp = PP[j];
334
+
335
+ // D(i-1, j-1) + d(a_i, b_j)
336
+ int pscore = oldSS[j - 1] + pairscore((*seqa)[i - 1], (*seqb)[j - 1]);
337
+
338
+ //no idea if this will work, but its supposed to be a stop codon aligner
339
+ //the bonus is assigned on the last codon, if the codons between don't make a big difference it'll be wrong. Hrm.
340
+
341
+ if(i >= 3 && j >= 3 && (*seqa)[i-3] == '$' && (*seqa)[i-2] == '$' && (*seqa)[i-1] == '$' &&
342
+ (((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'A' && (*seqb)[j-1] == 'G') ||
343
+ ((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'A' && (*seqb)[j-1] == 'A') ||
344
+ ((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'G' && (*seqb)[j-1] == 'A') ))
345
+ {
346
+ pscore += 6;
347
+ }
348
+ if(i >= 3 && j >= 3 && (*seqa)[i-2] == '$' && (*seqa)[i-1] == '$' && (*seqa)[i-0] == '$' &&
349
+ (((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'A' && (*seqb)[j-0] == 'G') ||
350
+ ((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'A' && (*seqb)[j-0] == 'A') ||
351
+ ((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'G' && (*seqb)[j-0] == 'A') ))
352
+ {
353
+ pscore += 6;
354
+ }
355
+ if(i >= 3 && j >= 3 && (*seqa)[i-1] == '$' && (*seqa)[i-0] == '$' && (*seqa)[i+1] == '$' &&
356
+ (((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'A' && (*seqb)[j+1] == 'G') ||
357
+ ((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'A' && (*seqb)[j+1] == 'A') ||
358
+ ((*seqb)[j-1] == 'T' && (*seqb)[j-0] == 'G' && (*seqb)[j+1] == 'A') ))
359
+ {
360
+ pscore += 6;
361
+ }
362
+
363
+ /*
364
+ D(i,j) = Min { D(i-1, j-1) + d(a_i, b_j), P(i,j), Q(i,j) }
365
+ where P(i,j) = Min { D(i-k, j) + w_k } for k = 1, .., i
366
+ and Q(i,j) = Min { D(i, j-k) + w_k } for k = 1, ..., j
367
+
368
+ i.e., three options are:
369
+ 1. match/mismatch,
370
+ 2. gap open/extension in sequence (a),
371
+ 3. gap open/extension in sequence (b)
372
+
373
+ pscore = D(i-1, j-1) + d(a_i, b_j)
374
+ tmp_pp = P(i,j)
375
+ q = Q(i,j)
376
+ */
377
+
378
+ //maybe just >?
379
+ if (tmp_pp >= pscore)
380
+ {
381
+ if (tmp_pp > q)
382
+ {
383
+ // gap open / extension in (a)
384
+ s = tmp_pp;
385
+ piSS[i][j] = i - 1;
386
+ pjSS[i][j] = j;
387
+ }
388
+ else // q > tmp_pp > pscore
389
+ {
390
+ // gap open / extension in (b)
391
+ s = q;
392
+ piSS[i][j] = i;
393
+ pjSS[i][j] = j - 1;
394
+ }
395
+ }
396
+ else // pscore > tmp_pp)
397
+ {
398
+ if (pscore > q)
399
+ {
400
+ // match / mismatch
401
+ s = pscore;
402
+ piSS[i][j] = i - 1;
403
+ pjSS[i][j] = j - 1;
404
+ }
405
+ else // q > pscore > tmp_pp
406
+ {
407
+ // gap open / extension in (b)
408
+ s = q;
409
+ piSS[i][j] = i;
410
+ pjSS[i][j] = j - 1;
411
+ }
412
+ }
413
+
414
+ SS[j] = s;
415
+
416
+ if (i == M && SS[j] >= maxiS)
417
+ {
418
+ maxiS = SS[j];
419
+ maxij = j;
420
+ }
421
+ }
422
+
423
+ if (SS[N] >= maxjS)
424
+ {
425
+ maxjS = SS[N];
426
+ maxji = i;
427
+ }
428
+
429
+ for (j = 0; j < N + 1; j++)
430
+ {
431
+ oldSS[j] = SS[j];
432
+ }
433
+ }
434
+
435
+ if (maxij>N)
436
+ maxij=N;
437
+ if (maxji>M)
438
+ maxji=M;
439
+ if (maxij<0)
440
+ maxij=0;
441
+ if (maxji<0)
442
+ maxji=0;
443
+
444
+ //add starting -'s
445
+ int alignment_score;
446
+ if (maxiS > maxjS)
447
+ {
448
+ alignment_score = maxiS;
449
+ i = M;
450
+ j = maxij;
451
+ for (int kk = N; kk > maxij; kk--)
452
+ {
453
+ *newseqb += (*seqb)[kk - 1];
454
+ *newseqa += '-';
455
+ }
456
+ }
457
+ else
458
+ {
459
+ alignment_score = maxjS;
460
+ i = maxji;
461
+ j = N;
462
+ for (int kk = M; kk > maxji; kk--)
463
+ {
464
+ *newseqa += (*seqa)[kk - 1];
465
+ *newseqb += '-';
466
+ }
467
+ }
468
+
469
+ bool decI = false;
470
+ bool decJ = false;
471
+ //inserting -'s in the middle!
472
+ while(i >= 1 && j >= 1)
473
+ {
474
+ decI=false;
475
+ decJ=false;
476
+ if (piSS[i][j] < i)
477
+ {
478
+ *newseqa += (*seqa)[i - 1];
479
+ decI = true;
480
+ }
481
+ else
482
+ {
483
+ *newseqa += '-';
484
+ }
485
+
486
+ if (pjSS[i][j] < j)
487
+ {
488
+ *newseqb += (*seqb)[j - 1];
489
+ decJ=true;
490
+ }
491
+ else
492
+ {
493
+ *newseqb += '-';
494
+ }
495
+
496
+ if (decI)
497
+ {
498
+ i--;
499
+ }
500
+ if (decJ)
501
+ {
502
+ j--;
503
+ }
504
+ }
505
+
506
+ //add extra trailing -'s
507
+ //forgive terminal gap penalties if user specifies this option
508
+ if (i < j)
509
+ {
510
+ for (int jj = j; jj >= 1; jj--)
511
+ {
512
+ *newseqb += (*seqb)[jj - 1];
513
+ *newseqa += '-';
514
+ if (use_terminal_gap_penalty==0) alignment_score += gep;
515
+ }
516
+ if (use_terminal_gap_penalty==0) alignment_score += gip;
517
+ }
518
+ else if(i > j)
519
+ {
520
+ for (int ii = i; ii >= 1; ii--)
521
+ {
522
+ *newseqa += (*seqa)[ii - 1];
523
+ *newseqb += '-';
524
+ if (use_terminal_gap_penalty==0) alignment_score += gep;
525
+ }
526
+ if (use_terminal_gap_penalty==0) alignment_score += gip;
527
+ }
528
+
529
+ reverse(newseqa);
530
+ reverse(newseqb);
531
+
532
+ for (i = 0; i < M + 1; i++)
533
+ {
534
+ delete []piSS[i];
535
+ delete []pjSS[i];
536
+ }
537
+
538
+ delete []SS;
539
+ delete []oldSS;
540
+ delete []piSS;
541
+ delete []pjSS;
542
+ delete []PP;
543
+ return alignment_score;
544
+ }
545
+
546
/*
  Remove pre-existing gap characters ('-') from a sequence prior to alignment.
  The sequence is modified in place; all other characters keep their order.

  The previous version stored the result of string::find in an unsigned int and
  compared it with -1 (which only works by accident of integer conversion and
  breaks for very long strings), and restarted the search from the beginning
  after every erase. This version makes a single pass.
*/
void degap(string* seq)
{
    string kept;
    kept.reserve(seq->size());

    for (string::size_type pos = 0; pos < seq->size(); ++pos)
    {
        const char c = (*seq)[pos];
        if (c != '-')
        {
            kept += c;
        }
    }

    seq->swap(kept);
}
561
+
562
/*
  Remove leading and trailing whitespace (space, tab, newline, carriage return)
  from a sequence, in place. Whitespace inside the sequence is kept.

  The previous version indexed (*seq)[seq->size() - 1] without checking for an
  empty string, so an empty or all-whitespace sequence read out of bounds.
*/
void trim(string* seq)
{
    static const char* const whitespace = " \t\n\r";

    const string::size_type first = seq->find_first_not_of(whitespace);
    if (first == string::npos)
    {
        // empty, or nothing but whitespace
        seq->clear();
        return;
    }

    const string::size_type last = seq->find_last_not_of(whitespace);
    *seq = seq->substr(first, last - first + 1);
}
577
+
578
+
579
/*
  Cluster gaps: for every '-' in the sequence, look across the run of identical
  letters next to it (first backwards, then forwards). If that run ends in another
  '-', the letter adjacent to the current gap is swapped with that far gap, which
  makes the two gaps contiguous. The sequence is modified in place and keeps its
  length.

  The previous version used an unsigned index for the backward scan together with
  `while (j >= 0)`, which is always true: for a gap at position 0 or 1, or when the
  run of identical letters reached the start of the sequence, the index wrapped
  around and the string was read far out of bounds. Signed indexes with explicit
  bounds are used now.
*/
void widen_gaps(string* seq)
{
    const int size = seq->size();

    for (int i = 0; i < size; i++)
    {
        if ((*seq)[i] != '-')
        {
            continue;
        }

        // backwards: needs a character in front of the gap
        if (i >= 1)
        {
            const char letter = (*seq)[i - 1];
            for (int j = i - 2; j >= 0; j--)
            {
                if ((*seq)[j] == '-')
                {
                    // swap this gap with the letter at i - 1
                    (*seq)[j] = letter;
                    (*seq)[i - 1] = '-';
                    break;
                }
                if ((*seq)[j] != letter)
                {
                    break; // the run of identical letters ended without a gap
                }
            }
        }

        // forwards: needs a character behind the gap
        if (i + 1 < size)
        {
            const char letter = (*seq)[i + 1];
            for (int j = i + 2; j < size; j++)
            {
                if ((*seq)[j] == '-')
                {
                    // swap this gap with the letter at i + 1
                    (*seq)[j] = letter;
                    (*seq)[i + 1] = '-';
                    break;
                }
                if ((*seq)[j] != letter)
                {
                    break; // the run of identical letters ended without a gap
                }
            }
        }
    }
}
638
+
639
+ #ifdef __PYTHON__
640
+ /* Python wrapper functions */
641
+ static PyObject * align_it(PyObject * self, PyObject * args)
642
+ {
643
+ const char * standard;
644
+ const char * seq;
645
+ int gap_init_penalty;
646
+ int gap_extend_penalty;
647
+ int use_terminal_gap_penalty;
648
+ int score;
649
+
650
+ if (!PyArg_ParseTuple(args, "ssiii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty, &use_terminal_gap_penalty)) {
651
+ return NULL;
652
+ }
653
+
654
+ init_pairscore(5, 4); // match, mismatch scores +5, -4 respectively (HyPhy defaults)
655
+
656
+ string* seqa = new string(standard);
657
+ string* seqb = new string(seq);
658
+ trim(seqa);
659
+ trim(seqb);
660
+ //degap(seqa);
661
+ //degap(seqb);
662
+ string* newseqa = new string();
663
+ string* newseqb = new string();
664
+
665
+ score = align(seqa, seqb, newseqa, newseqb, gap_init_penalty, gap_extend_penalty, use_terminal_gap_penalty);
666
+
667
+ PyObject * retval = Py_BuildValue("ssi", newseqa->c_str(), newseqb->c_str(), score);
668
+
669
+ delete seqa;
670
+ delete seqb;
671
+ delete newseqa;
672
+ delete newseqb;
673
+
674
+ return retval;
675
+ }
676
+
677
+ static PyObject * align_it_rb(PyObject * self, PyObject * args)
678
+ {
679
+ // emulate Ruby implementation of align_it
680
+ const char * standard;
681
+ const char * seq;
682
+ int gap_init_penalty;
683
+ int gap_extend_penalty;
684
+
685
+ if (!PyArg_ParseTuple(args, "ssii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty)) {
686
+ return NULL;
687
+ }
688
+
689
+ init_pairscore(1, 1);
690
+
691
+ string* seqa = new string(standard); // reference
692
+ string* seqb = new string(seq); // query
693
+ trim(seqa);
694
+ trim(seqb);
695
+ degap(seqa);
696
+ degap(seqb);
697
+ string* newseqa = new string();
698
+ string* newseqb = new string();
699
+
700
+ align(seqa, seqb, newseqa, newseqb, gap_init_penalty, gap_extend_penalty, 0);
701
+
702
+ PyObject * retval = Py_BuildValue("ss", newseqa->c_str(), newseqb->c_str());
703
+ delete seqa;
704
+ delete seqb;
705
+ delete newseqa;
706
+ delete newseqb;
707
+
708
+ return retval;
709
+ }
710
+
711
+ static PyObject * align_it_aa(PyObject * self, PyObject * args)
712
+ {
713
+ const char * standard;
714
+ const char * seq;
715
+ int gap_init_penalty;
716
+ int gap_extend_penalty;
717
+ int use_terminal_gap_penalty;
718
+ int score;
719
+
720
+ if (!PyArg_ParseTuple(args, "ssiii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty, &use_terminal_gap_penalty)) {
721
+ return NULL;
722
+ }
723
+
724
+ init_pairscore_hiv25();
725
+
726
+ string* seqa = new string(standard); // reference
727
+ string* seqb = new string(seq); // query
728
+ trim(seqa);
729
+ trim(seqb);
730
+ //degap(seqa); // HyPhy behaviour is to not remove gaps
731
+ //degap(seqb);
732
+ string* newseqa = new string();
733
+ string* newseqb = new string();
734
+
735
+ score = align(seqa, seqb, newseqa, newseqb, gap_init_penalty, gap_extend_penalty, use_terminal_gap_penalty);
736
+
737
+ PyObject * retval = Py_BuildValue("ssi", newseqa->c_str(), newseqb->c_str(), score);
738
+ delete seqa;
739
+ delete seqb;
740
+ delete newseqa;
741
+ delete newseqb;
742
+
743
+ return retval;
744
+ }
745
+
746
+ static PyObject * align_it_aa_rb(PyObject * self, PyObject * args)
747
+ {
748
+ // emulate Ruby implementation of align_it_aa
749
+ const char * standard;
750
+ const char * seq;
751
+ int gap_init_penalty;
752
+ int gap_extend_penalty;
753
+
754
+ if (!PyArg_ParseTuple(args, "ssii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty)) {
755
+ return NULL;
756
+ }
757
+
758
+ init_pairscore_aa(4, -2);
759
+
760
+ string* seqa = new string(standard); // reference
761
+ string* seqb = new string(seq); // query
762
+ trim(seqa);
763
+ trim(seqb);
764
+ degap(seqa);
765
+ degap(seqb);
766
+ string* newseqa = new string();
767
+ string* newseqb = new string();
768
+
769
+ align(seqa, seqb, newseqa, newseqb, gap_init_penalty, gap_extend_penalty, 0);
770
+
771
+ PyObject * retval = Py_BuildValue("ss", newseqa->c_str(), newseqb->c_str());
772
+ delete seqa;
773
+ delete seqb;
774
+ delete newseqa;
775
+ delete newseqb;
776
+
777
+ return retval;
778
+ }
779
+
780
+ static PyMethodDef AlignmentMethods [] = // method table of the Python module: {name, C function, calling convention, docstring}
781
+ {
782
+ {"align_it", align_it, METH_VARARGS, "Pairwise alignment of nucleotide sequences."}, // returns (aligned standard, aligned seq, score)
783
+ {"align_it_rb", align_it_rb, METH_VARARGS, "Pairwise alignment of nucleotide sequences using ReCall settings."}, // returns (aligned standard, aligned seq)
784
+ {"align_it_aa", align_it_aa, METH_VARARGS, "Pairwise alignment of protein sequences using empirical HIV 25% score matrix."}, // returns (aligned standard, aligned seq, score)
785
+ {"align_it_aa_rb", align_it_aa_rb, METH_VARARGS, "Pairwise alignment of protein sequences using ReCall settings."}, // returns (aligned standard, aligned seq)
786
+ {NULL, NULL, 0, NULL} // sentinel entry terminating the table
787
+ };
788
+
789
+ static struct PyModuleDef AlignmentModuleDef = { // module definition handed to PyModule_Create in PyInit_gotoh
790
+ PyModuleDef_HEAD_INIT, // m_base: required initialiser
791
+ "gotoh", // m_name: name of the module
792
+ NULL, // m_doc: no module docstring
793
+ -1, // m_size: -1, no per-module state is allocated (scores live in the global nucMat)
794
+ AlignmentMethods, // m_methods: the method table defined above
795
+ NULL, // remaining optional members are unused
796
+ NULL,
797
+ NULL,
798
+ NULL
799
+ };
800
+
801
+ PyMODINIT_FUNC PyInit_gotoh(void) {
802
+ return PyModule_Create(&AlignmentModuleDef);
803
+ }
804
+
805
+ #else
806
+ /* Ruby wrapper functions */
807
+ extern "C" VALUE align_it(VALUE self, VALUE standard, VALUE seq, VALUE gap_init_penalty, VALUE gap_extend_penalty)
808
+ {
809
+ init_pairscore(1, 1);
810
+
811
+ string* seqa = new string(RSTRING_PTR(standard));
812
+ string* seqb = new string(RSTRING_PTR(seq));
813
+ trim(seqa);
814
+ trim(seqb);
815
+ degap(seqa);
816
+ degap(seqb);
817
+ string* newseqa = new string();
818
+ string* newseqb = new string();
819
+ align(seqa, seqb, newseqa, newseqb, NUM2INT(gap_init_penalty), NUM2INT(gap_extend_penalty), 0);
820
+
821
+ VALUE ret = rb_ary_new3(2, rb_str_new2(newseqa->c_str()),rb_str_new2(newseqb->c_str()));
822
+
823
+ delete seqa;
824
+ delete seqb;
825
+ delete newseqa;
826
+ delete newseqb;
827
+
828
+ return ret;
829
+ }
830
+
831
+ extern "C" VALUE align_it_aa(VALUE self, VALUE standard, VALUE seq, VALUE gap_init_penalty, VALUE gap_extend_penalty)
832
+ {
833
+ init_pairscore_aa(4, -2);
834
+
835
+ string* seqa = new string(RSTRING_PTR(standard));
836
+ string* seqb = new string(RSTRING_PTR(seq));
837
+ trim(seqa);
838
+ trim(seqb);
839
+ degap(seqa);
840
+ degap(seqb);
841
+ string* newseqa = new string();
842
+ string* newseqb = new string();
843
+ align(seqa, seqb, newseqa, newseqb, NUM2INT(gap_init_penalty), NUM2INT(gap_extend_penalty), 0);
844
+
845
+ VALUE ret = rb_ary_new3(2, rb_str_new2(newseqa->c_str()),rb_str_new2(newseqb->c_str()));
846
+
847
+ delete seqa;
848
+ delete seqb;
849
+ delete newseqa;
850
+ delete newseqb;
851
+
852
+ return ret;
853
+ }
854
+
855
+ extern "C" void Init_cfe_gotoh()
856
+ {
857
+ VALUE gotoh = rb_define_module("CfeGotoh");
858
+ rb_define_module_function(gotoh, "align_it", (VALUE(*)(...))align_it, 4);
859
+ rb_define_module_function(gotoh, "align_it_aa", (VALUE(*)(...))align_it_aa, 4);
860
+ }
861
+
862
+ #endif
@@ -0,0 +1,4 @@
1
+ require "mkmf"
2
+
3
+ create_header # mkmf: write the extconf.h header for the extension build
4
+ create_makefile('cfe_gotoh/cfe_gotoh') # mkmf: generate the Makefile; the target name matches the require_relative 'cfe_gotoh/cfe_gotoh' in lib/cfe_gotoh.rb
data/lib/cfe_gotoh.rb ADDED
@@ -0,0 +1,413 @@
1
+ #TODO: Scoring algorithm to improve frame_align?
2
+
3
+ require_relative 'cfe_gotoh/cfe_gotoh'
4
+
5
+
6
+ module CfeGotoh
7
# Base class for errors defined by this library.
class Error < RuntimeError; end

# Raised by cluster_gaps when a gap cannot be merged into a codon-sized
# gap and the caller asked for errors to be raised.
class GapMergeError < Error; end
12
+
13
# Build the substitution matrix used by score_alignment. It is a 127x127
# table indexed by character code; every pair defaults to -1.0.
sub_matrix = Array.new(127) { Array.new(127, -1.0) }

# Assign the same score to (a, b) and (b, a).
set_pair = lambda do |a, b, score|
  sub_matrix[b.ord][a.ord] = score
  sub_matrix[a.ord][b.ord] = score
end

# Each code matches itself; every code except N is heavily penalised
# against X.
%w[A T G C R Y K M B D H V S W N].each do |nuc|
  set_pair.call(nuc, nuc, 1.0)
  set_pair.call(nuc, 'X', -6.0) unless nuc == 'N'
end

# IUPAC mixtures match each of the bases they stand for.
{
  # bi-mixtures
  'R' => 'AG', 'Y' => 'CT', 'K' => 'GT', 'M' => 'CA', 'S' => 'CG', 'W' => 'TA',
  # tri-mixtures
  'B' => 'CGT', 'D' => 'AGT', 'H' => 'ACT', 'V' => 'ACG'
}.each do |mixture, bases|
  bases.each_char { |base| set_pair.call(base, mixture, 1.0) }
end

# Other special cases. N/N is set after the self-match loop above, so it
# ends up as 0.0 rather than 1.0.
set_pair.call('$', '$', 50.0)
set_pair.call('T', 'U', 1.0)
set_pair.call('N', 'N', 0.0)
set_pair.call('X', '-', 3.0)

# Scores of the four plain bases against the special symbols and N.
special_scores = { '*' => 1.0, '&' => 0.7, '$' => 0.0, '.' => -20.0, 'N' => -3.0 }
%w[A T G C].each do |base|
  special_scores.each { |symbol, score| set_pair.call(base, symbol, score) }
end

# Freeze every row and the outer array so the table cannot be modified.
sub_matrix.each(&:freeze)
sub_matrix.freeze

NUCLEOTIDE_MATRIX = sub_matrix
60
+
61
# Score an aligned standard/query pair column by column.
#
# For every position in +standard+ the upcased characters of both
# sequences are looked up in NUCLEOTIDE_MATRIX and the scores are added up.
#
# standard - aligned reference String; its length drives the loop.
# query    - aligned query String, expected to be at least as long.
#
# Returns the total score as a Float (0.0 for an empty standard).
def self.score_alignment(standard, query)
  (0...standard.size).inject(0.0) do |total, pos|
    standard_code = standard[pos, 1].upcase.ord
    query_code = query[pos, 1].upcase.ord
    total + NUCLEOTIDE_MATRIX[standard_code][query_code]
  end
end
68
+
69
# List the gaps ('-' runs) of an aligned sequence.
#
# seq - aligned sequence String.
#
# Returns an Array with one entry per run of consecutive dashes, each entry
# being the Array of positions in that run, e.g. [[3, 4, 5], [9], [11, 12]].
# Returns [] when the sequence contains no dashes.
def self.make_gap_list(seq)
  runs = []
  seq.each_char.with_index do |char, pos|
    next unless char == '-'

    if runs.empty? || runs.last.last != pos - 1
      runs << [pos]      # not adjacent to the previous dash: start a new run
    else
      runs.last << pos   # extends the current run
    end
  end
  runs
end
88
+
89
# Remove the leading gap of the standard, and the same number of leading
# characters from the query, modifying both strings in place.
#
# Nothing happens unless the standard starts with one or more dashes that
# are followed by a non-dash character (so an all-dash standard is left
# untouched).
#
# The pattern is anchored with \A (start of string) rather than ^ (start of
# any line): with ^ a dash run after an embedded newline would match and the
# wrong characters would be removed from the front of both strings.
#
# standard - aligned reference String (modified in place).
# query    - aligned query String (modified in place).
#
# The return value is not meaningful (nil when nothing was trimmed).
def self.trim_leading_dashes(standard, query)
  leading_dashes = standard[/\A(-+)[^-]/, 1]
  return if leading_dashes.nil?

  standard[0, leading_dashes.size] = ''
  query[0, leading_dashes.size] = ''
end
98
+
99
# Remove the trailing gap of the standard, and the characters at the same
# positions of the query, modifying both strings in place.
#
# Nothing happens unless the standard ends with one or more dashes that are
# preceded by a non-dash character (so an all-dash standard is left
# untouched).
#
# The pattern is anchored with \z (end of string) rather than $ (end of any
# line): with $ a dash run in front of an embedded newline would match and
# the wrong characters would be removed from the end of both strings.
#
# standard - aligned reference String (modified in place).
# query    - aligned query String (modified in place).
#
# The return value is not meaningful (nil when nothing was trimmed).
def self.trim_trailing_dashes(standard, query)
  trailing_dashes = standard[/[^-](-+)\z/, 1]
  return if trailing_dashes.nil?

  end_of_standard = standard.size - trailing_dashes.size
  standard[end_of_standard, trailing_dashes.size] = ''
  query[end_of_standard, trailing_dashes.size] = ''
end
109
+
110
# Pad the gap at one edge of the query out to whole codons, in place.
#
# If the query starts (side == :leading) or ends (any other side value)
# with a run of dashes whose length is not a multiple of 3, the one or two
# bases next to the run are overwritten with dashes so that the run becomes
# a whole number of codons.
#
# Differences from a naive implementation, all of them edge cases:
# * a query made only of dashes (or an empty query) is left untouched
#   instead of raising NoMethodError on a failed pattern match;
# * only positions inside the string are overwritten, so the query never
#   grows and a trailing fix never wraps around to the other end;
# * the patterns are anchored to the string (\A, \z), not to lines.
#
# query - aligned query String (modified in place).
# side  - :leading to fix the start, anything else to fix the end.
#
# Returns nil.
def self.fix_incomplete_edge_codon(query, side=:leading)
  leading = (side == :leading)
  edge_pattern = leading ? /\A(-+)[^-]/ : /[^-](-+)\z/

  # nil when the edge is not a dash, or when there is no base next to the
  # dash run (all-dash or empty query).
  dashes = query[edge_pattern, 1]
  return if dashes.nil?

  # Number of extra dashes needed to complete the codon: 0, 1 or 2.
  missing = (3 - dashes.size % 3) % 3

  first_non_dash_idx = leading ? dashes.size : query.size - dashes.size - 1
  incr = leading ? 1 : -1

  missing.times do |step|
    idx = first_non_dash_idx + step * incr
    query[idx] = '-' if idx >= 0 && idx < query.size
  end
  nil
end
138
+
139
# Merge nearby insertions and deletions until the aligned sequences have a
# length divisible by 3. This helps fix poor insertions near the start of
# the sequence. Both strings are modified in place.
#
# A dash in the standard (an insertion) is merged with a dash in the query
# (a deletion) that lies one or two positions away: both dashes are removed,
# which shortens each sequence by one. After every merge the scan restarts
# from the very first position, including index 0. (Restarting at index 1
# would mean a gap sitting at the start of the standard is never looked at
# again.)
#
# standard - aligned reference String (modified in place).
# query    - aligned query String of the same length (modified in place).
#
# Returns nil.
# Raises RuntimeError if the two sequences differ in length.
def self.merge_insertions_and_deletions_to_fix_oof_sequences(
  standard,
  query
)
  raise 'Standard and query should be the same length' if standard.size() != query.size()
  return if standard.size % 3 == 0

  dex = 0
  while (dex = standard.index('-', dex))
    # Look one base away, then two bases away, for a dash in the query.
    partner = [-1, 1, -2, 2].map { |offset| dex + offset }
                            .find { |pos| pos >= 0 && query[pos] == '-' }

    if partner
      standard[dex] = ''
      query[partner] = ''
      # Stop as soon as the sequences have a cogent length.
      break if standard.size % 3 == 0
      dex = 0     # restart the scan from the beginning
    else
      dex += 1    # nothing to merge here; move on to the next dash
    end
  end
  nil
end
167
+
168
# Merge neighbouring gaps whose sizes are not a multiple of 3 into
# codon-sized gaps.
#
# gaps         - Array of gaps in left-to-right order, each gap an Array of
#                consecutive positions (see make_gap_list). NOTE: entries
#                that get absorbed into an earlier gap are replaced by []
#                in this array, i.e. the argument is modified.
# raise_errors - when true, a gap that cannot be merged raises
#                GapMergeError; otherwise it is kept as it is.
#
# Returns a new Array of gaps.
def self.cluster_gaps(gaps, raise_errors=false)
  clustered = []

  gaps.each_with_index do |current, idx|
    next if current.empty?   # already absorbed by an earlier gap

    if (current.size % 3).zero?   # codon-sized already
      clustered << current
      next
    end

    second = gaps[idx + 1]   # either of these may be nil
    third = gaps[idx + 2]

    if second && ((current + second).size % 3).zero? && (second.first - current.last) < 9
      # Merge with the next gap, growing the larger of the two.
      merged =
        if second.size > current.size
          Array.new(current.size) { |k| second.first - current.size + k } + second
        else
          current + Array.new(second.size) { |k| current.last + 1 + k }
        end
      clustered << merged
      gaps[idx + 1] = []
    elsif second && third &&
          ((current + second + third).size % 3).zero? &&
          (third.first - current.last) < 12
      # Merge with the next two gaps, centred on the middle one.
      before_second = Array.new(current.size) { |k| second.first - current.size + k }
      after_second = Array.new(third.size) { |k| second.last + 1 + k }
      clustered << (before_second + second + after_second)
      gaps[idx + 1] = []
      gaps[idx + 2] = []
    elsif raise_errors
      # No merge possible and the caller wants to know about it.
      raise GapMergeError
    else
      clustered << current # FIXME this behaviour differs between insertions and deletions
    end
  end

  clustered
end
215
+
216
# Shift gaps onto codon boundaries, preferring common gap locations when
# they are given. The gap arrays are rewritten in place.
#
# gaps                 - Array of gaps (Arrays of positions) in ascending,
#                        left-to-right order.
# common_gap_locations - optional collection of preferred gap positions in
#                        amino-acid (codon) units, or nil.
#
# Returns the same +gaps+ array that was passed in.
def self.align_gaps_to_frame(gaps, common_gap_locations=nil)
  shift_so_far = 0   # total size of the gaps handled so far

  gaps.each do |gap|
    unless common_gap_locations.nil?
      # Distance, in bases, between a common location and this gap's start
      # once the shift caused by earlier gaps is taken out.
      distance = ->(location) { (3 * location - (gap[0] - shift_so_far)).abs }
      nearest = common_gap_locations.min_by { |location| distance.call(location) }

      # Snap to the common location when it is within 3 amino acids.
      if !nearest.nil? && distance.call(nearest) <= 9
        anchor = 3 * nearest + shift_so_far
        gap.replace(Array.new(gap.size) { |k| anchor + k })
      end
    end

    # Move the gap to the nearest frame boundary.
    # Original comment from Conan: scoring would be good here
    case gap[0] % 3
    when 1 then gap.map! { |pos| pos - 1 }   # set back one base
    when 2 then gap.map! { |pos| pos + 1 }   # set forward one base
    end

    shift_so_far += gap.size
  end

  gaps
end
258
+
259
# Rebuild a gapped sequence from its bases and a list of gap positions.
#
# All existing dashes are dropped from +seq+ and a dash is inserted at each
# listed position. Positions refer to the *aligned* sequence, so they
# already include the shift caused by gaps placed earlier; the gaps must
# therefore be given in left-to-right order. A position beyond the current
# end of the sequence appends the dash instead.
#
# seq  - sequence String (not modified).
# gaps - Array of gaps, each an Array of positions.
#
# Returns a new String.
def self.splice_gaps_into_sequence(seq, gaps)
  gapped = seq.delete('-')

  gaps.each do |gap|
    gap.each do |pos|
      if pos > gapped.size
        gapped << '-'
      else
        gapped.insert(pos, '-')
      end
    end
  end

  gapped
end
276
+
277
+ #common_insert_locations is based on amino acid locations starting at base 0.
278
+ #Assumes standard in the first base.
279
+ #Prealigned lets you run a lot of the corrections and QC on an already aligned sequence.
280
# Align +query+ to +standard+ and move the resulting gaps onto codon
# boundaries.
#
# standard                - reference sequence String.
# query                   - query sequence String.
# gap_init                - gap-open penalty handed to align_it.
# gap_penalty             - gap-extension penalty handed to align_it.
# common_insert_locations - optional list of preferred insertion positions,
#                           in amino-acid units starting at 0.
# trim                    - when true, strip the leading/trailing gap of the
#                           standard and pad edge gaps of the query to whole
#                           codons.
# raise_errors            - when true, raise instead of returning an
#                           alignment whose gaps cannot be put in frame.
# prealigned              - when true, skip align_it and treat the two
#                           arguments as an existing alignment.
#
# The arguments are never modified: with prealigned=true the corrections
# run on copies, so frozen strings are accepted and the caller's alignment
# stays intact.
#
# Returns [aligned_standard, aligned_query].
# Raises RuntimeError when the sequences differ in length after alignment,
# or (only with raise_errors) when the gaps cannot be frame aligned.
def self.frame_align(
  standard,
  query,
  gap_init=3,
  gap_penalty=1,
  common_insert_locations=nil,
  trim=false,
  raise_errors=false,
  prealigned=false
)
  common_insert_locations = [] if common_insert_locations.nil?

  if prealigned
    # The helpers below edit their arguments in place; work on copies.
    standard = standard.dup
    query = query.dup
  else
    elem = align_it(standard, query, gap_init, gap_penalty)
    standard = elem[0]
    query = elem[1]
  end
  raise "Standard and query should be the same length" if standard.size() != query.size()

  # Trim leading and trailing dashes if desired.
  if trim
    trim_leading_dashes(standard, query)
    trim_trailing_dashes(standard, query)
    fix_incomplete_edge_codon(query, :leading)
    fix_incomplete_edge_codon(query, :trailing)
  end

  merge_insertions_and_deletions_to_fix_oof_sequences(standard, query)

  if raise_errors
    # Dashes in the standard are inserted bases, dashes in the query are
    # deleted bases; both totals must be whole codons.
    inserted_bases = standard.count('-')
    if inserted_bases % 3 != 0
      raise "Cannot frame align, #{inserted_bases} inserted bases not divisible by 3"
    end
    deleted_bases = query.count('-')
    if deleted_bases % 3 != 0
      raise "Cannot frame align, #{deleted_bases} deleted bases not divisible by 3"
    end
  end

  # Build the insert/delete lists. These lists look like
  # [[3,4,5], [9], [11,12]]
  insert_list = make_gap_list(standard)
  delete_list = make_gap_list(query)

  # Process the insertions.
  unless insert_list.empty?
    new_ins_list = []

    # Step 1: cluster the insertions.
    begin
      new_ins_list = cluster_gaps(insert_list, raise_errors)
    rescue GapMergeError
      raise "Cannot frame align insert" if raise_errors
    end

    # Step 2: frame-align the insertions, shifting things to common
    # insertion positions where appropriate.
    align_gaps_to_frame(new_ins_list, common_insert_locations)

    # Put the insertions back into the standard.
    standard = splice_gaps_into_sequence(standard, new_ins_list)
  end

  # Process the deletions.
  unless delete_list.empty?
    new_del_list = []

    # As above, step 1 is to cluster the deletions.
    # FIXME note that the original code behaved differently between
    # insertions and deletions; confirm that this is the right
    # way forward.
    begin
      new_del_list = cluster_gaps(delete_list, raise_errors)
    rescue GapMergeError
      raise "Cannot frame align deletion" if raise_errors
    end

    # Again as above, frame-align the deletions; this time
    # we don't worry about any common deletion positions.
    align_gaps_to_frame(new_del_list)

    # Put the deletions back into the query.
    query = splice_gaps_into_sequence(query, new_del_list)
  end

  return [standard, query]
end
365
+
366
+ #Returns a [seq_sans_inserts, [list of inserts]]
367
# Convenience wrapper around remove_insertions_from_query for an alignment
# given as a pair.
#
# elem - [aligned_standard, aligned_query], e.g. the result of frame_align.
#
# Returns whatever remove_insertions_from_query returns for that pair.
def self.remove_inserts(elem)
  aligned_standard = elem[0]
  aligned_query = elem[1]
  remove_insertions_from_query(aligned_standard, aligned_query)
end
370
+
371
# Strip the insertions out of an aligned query and report them.
#
# An insertion is a run of consecutive dashes in the standard; the query
# bases at those positions are the inserted bases.
#
# The inserted bases are cut out by position. (Marking them with a '.'
# placeholder and deleting every '.' afterwards would also delete any '.'
# that is genuinely part of the query.)
#
# standard - aligned reference String.
# query    - aligned query String, assumed to be as long as the standard
#            (not modified).
#
# Returns [query_without_insertions, inserts], where inserts is an Array of
# [position, inserted_bases] pairs. position is the start of the insertion
# with the bases inserted before it taken out, divided by 3 (integer
# division), i.e. a codon index.
def self.remove_insertions_from_query(standard, query)
  # Group the dash positions of the standard into runs of consecutive
  # positions.
  insert_runs = []
  standard.each_char.with_index do |char, pos|
    next unless char == '-'

    if insert_runs.empty? || insert_runs.last.last != pos - 1
      insert_runs << [pos]
    else
      insert_runs.last << pos
    end
  end

  inserts = []
  offset = 0   # bases inserted before the current run
  insert_runs.each do |run|
    inserts << [(run.first - offset) / 3, query[run.first, run.size]]
    offset += run.size
  end

  # Cut the runs out from right to left so that the positions of the runs
  # still to be removed stay valid.
  seq = query.dup
  insert_runs.reverse_each { |run| seq[run.first, run.size] = '' }

  [seq, inserts]
end
413
+ end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cfe_gotoh
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0.pre
5
+ platform: ruby
6
+ authors:
7
+ - Conan Woods
8
+ - Jamie Kai
9
+ - David Rickett
10
+ - Richard Liang
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2024-11-22 00:00:00.000000000 Z
15
+ dependencies: []
16
+ description:
17
+ email:
18
+ executables: []
19
+ extensions:
20
+ - ext/cfe_gotoh/extconf.rb
21
+ extra_rdoc_files: []
22
+ files:
23
+ - ext/cfe_gotoh/cfe_gotoh.cpp
24
+ - ext/cfe_gotoh/extconf.rb
25
+ - lib/cfe_gotoh.rb
26
+ homepage:
27
+ licenses: []
28
+ metadata:
29
+ github_repo: ssh://github.com/cfe-lab/gotoh
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">"
42
+ - !ruby/object:Gem::Version
43
+ version: 1.3.1
44
+ requirements: []
45
+ rubygems_version: 3.0.9
46
+ signing_key:
47
+ specification_version: 4
48
+ summary: CfE implementation of the Gotoh sequence alignment algorithm
49
+ test_files: []