RubyGems - cfe_gotoh - Versions diffs - 0.4.0.pre - Mend

cfe_gotoh 0.4.0.pre

Files changed (5) hide show

checksums.yaml +7 -0
data/ext/cfe_gotoh/cfe_gotoh.cpp +862 -0
data/ext/cfe_gotoh/extconf.rb +4 -0
data/lib/cfe_gotoh.rb +413 -0
metadata +49 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 69fd99e925d82fb14d1035e8461fd32d6d83895ae9bf012efea277596772e7ce
+  data.tar.gz: 92b0780c678fa34b4be4d2148df83ed1f60c9d63c15a5742f3c17fa1cb4beb24
+SHA512:
+  metadata.gz: ee778affdf1b42aca92b9acbedf6368df7b906a0b5ab6793132b126f04b781baa2a88a04254b0aa9c51a2f2dd3b09b9b0713edbba9ba85db86e26b6eaa0a5745
+  data.tar.gz: 01f949ecf25c278706fe4718b59e9ac3aa6e643d8ab3063a5fa1aba390d7f2fd3b28e7dbe1703fce94a84affd7b6573e1bbac29f117bb3442d7a6ef199fd8cbe

data/ext/cfe_gotoh/cfe_gotoh.cpp ADDED Viewed

@@ -0,0 +1,862 @@
+#include <string>
+#ifdef __PYTHON__
+    #include <Python.h>
+#else
+    #include "ruby.h"
+    #ifndef RSTRING_PTR
+        // Ruby 1.8.5 doesn't include this definition
+        #define RSTRING_PTR(s) (RSTRING(s)->ptr)
+    #endif
+#endif
+using namespace std;
+/*
+I think this application should do a complete alignment.  Unfortunately aligning is way too slow in perl,
+so I suspect the alignment, merging and possibly the gap widening should be done in c.  Another possibility
+is to call the C functions from perl, which would simplify things quite a bit!  Unfortunately, I'm not entirely
+confident that perl can do this seamlessly (unlike nicer languages like ruby and python).
+*/
+void trim(string* seq); // forward declaration; trim() is defined further down in this file
+static int nucMat[127][127]; // pair-score table indexed by character code; valid indices are 0..126, so ASCII code 127 (DEL) is NOT covered
+// Store the same score in nucMat[a][b] and nucMat[b][a].
+static void set_symmetric_score(int a, int b, int score)
+{
+    nucMat[a][b] = score;
+    nucMat[b][a] = score;
+}
+// Score `symbol` against every letter of `upper` (letters given in upper case)
+// and against each letter's lower-case form, in both directions.
+static void score_symbol_vs_letters(char symbol, const char* upper, int score)
+{
+    for (const char* p = upper; *p != '\0'; ++p)
+    {
+        set_symmetric_score(symbol, *p, score);
+        set_symmetric_score(symbol, *p + ('a' - 'A'), score);
+    }
+}
+/*
+ Fill nucMat with nucleotide pair scores.
+   matchscore      - score for identical (or equivalent) characters
+   mismatchPenalty - positive penalty; every other pair scores -mismatchPenalty
+ On top of the match/mismatch default, special cases are applied for
+ case-insensitive A/C/G/T/U, IUPAC mixtures, and the '*', '$', '.', 'N', 'X'
+ symbols (see the inline comments).
+ Fixes relative to the previous version:
+   - nucMat['u']['T'] is now set (the old code assigned ['t']['T'] twice instead)
+   - nucMat['-']['X'] is now set (the old code assigned ['X']['-'] twice instead)
+*/
+void init_pairscore(int matchscore, int mismatchPenalty)
+{
+    for (int i = 0; i < 127; i++)
+    {
+        for (int j = 0; j < 127; j++)
+        {
+            nucMat[i][j] = (i == j) ? matchscore : -mismatchPenalty;
+        }
+    }
+    // adjust naive assignments for case-insensitivity
+    set_symmetric_score('a', 'A', matchscore);
+    set_symmetric_score('c', 'C', matchscore);
+    set_symmetric_score('g', 'G', matchscore);
+    // T and U are interchangeable in either case
+    static const char t_or_u[] = "tTuU";
+    for (int p = 0; p < 4; p++)
+    {
+        for (int q = 0; q < 4; q++)
+        {
+            nucMat[(int)t_or_u[p]][(int)t_or_u[q]] = matchscore;
+        }
+    }
+    // N against N is neutral (note: 'n' against 'n' keeps matchscore, as before)
+    nucMat['N']['N'] = 0;
+    set_symmetric_score('n', 'N', 0);
+    // bi-mixtures followed by tri-mixtures: a base matches any mixture code listed with it
+    static const char* const mixture_pairs[] = {
+        "AR", "GR", "CY", "TY", "GK", "TK", "CM", "AM", "CS", "GS", "TW", "AW",
+        "CB", "GB", "TB", "AD", "GD", "TD", "AH", "CH", "TH", "AV", "CV", "GV"
+    };
+    const unsigned int pair_count = sizeof(mixture_pairs) / sizeof(mixture_pairs[0]);
+    for (unsigned int k = 0; k < pair_count; k++)
+    {
+        set_symmetric_score(mixture_pairs[k][0], mixture_pairs[k][1], matchscore);
+    }
+    // wild cards
+    score_symbol_vs_letters('*', "ACTG", matchscore);
+    nucMat['$']['$'] = 50;
+    // for those annoying duplicate phred values
+    score_symbol_vs_letters('.', "ACTG", -20);
+    score_symbol_vs_letters('N', "ACTG", -3);
+    // for easy alignment to a standard with gaps
+    score_symbol_vs_letters('X', "ACTGRYKMSWBDHV", -6);
+    set_symmetric_score('X', '-', 3);
+}
+/*
+ Fill nucMat with simple amino-acid pair scores.
+   identical characters        -> matchscore
+   any pair involving 'X'      -> -4
+   every other pair            -> -mismatchPenalty (the argument is negated)
+ Afterwards 'Z' against 'Z' (mixed or upper case) is set to 0 and 'X' against
+ '-' is set to matchscore in both directions.
+*/
+void init_pairscore_aa(int matchscore, int mismatchPenalty)
+{
+    for (int row = 0; row < 127; row++)
+    {
+        for (int col = 0; col < 127; col++)
+        {
+            int score;
+            if (row == col)
+                score = matchscore;
+            else if (row == 'X' || col == 'X')
+                score = -4;
+            else
+                score = -mismatchPenalty;
+            nucMat[row][col] = score;
+        }
+    }
+    nucMat['Z']['z'] = 0;
+    nucMat['z']['Z'] = 0;
+    nucMat['Z']['Z'] = 0;
+    nucMat['-']['X'] = matchscore;
+    nucMat['X']['-'] = matchscore;
+}
+/*
+    Empirical score matrix based on 25% divergent HIV sequences.
+    See Nickle, David C., et al. "HIV-specific probabilistic models of protein evolution."
+        PLoS One 2.6 (2007): e503.
+    Rows and columns are indexed 0..23. init_pairscore_hiv25() maps index k to the
+    k-th symbol of the alphabet ARNDCQEGHILKMFPSTWYVBZ?* when copying into nucMat.
+*/
+static int empirical_hiv25[24][24] = {\
+{7,-7,-7,-4,-10,-11,-4,-3,-10,-6,-9,-9,-7,-13,-3,-2,1,-16,-15,0,-5,-5,-3,-17},\
+{-7,7,-5,-11,-8,-2,-7,-2,0,-6,-6,2,-3,-12,-4,-2,-2,-5,-9,-10,-7,-3,-3,-17},\
+{-7,-5,8,2,-9,-6,-6,-7,0,-6,-12,0,-10,-12,-9,1,0,-17,-3,-10,6,-6,-3,-17},\
+{-4,-11,2,8,-14,-10,0,-2,-3,-11,-15,-7,-13,-15,-13,-5,-6,-16,-6,-5,7,0,-3,-17},\
+{-10,-8,-9,-14,11,-16,-15,-5,-7,-11,-9,-13,-14,0,-12,-1,-6,-2,0,-8,-10,-16,-5,-17},\
+{-11,-2,-6,-10,-16,8,-2,-10,0,-12,-4,0,-8,-12,-1,-9,-8,-14,-9,-13,-7,6,-4,-17},\
+{-4,-7,-6,0,-15,-2,7,-1,-9,-12,-15,-1,-10,-17,-13,-11,-8,-15,-12,-5,0,6,-4,-17},\
+{-3,-2,-7,-2,-5,-10,-1,7,-10,-11,-14,-6,-12,-9,-11,-1,-7,-5,-14,-5,-4,-3,-4,-17},\
+{-10,0,0,-3,-7,0,-9,-10,10,-10,-4,-5,-10,-6,-3,-6,-6,-11,2,-14,-1,-2,-3,-17},\
+{-6,-6,-6,-11,-11,-12,-12,-11,-10,7,0,-7,0,-2,-10,-4,0,-14,-9,2,-7,-12,-2,-17},\
+{-9,-6,-12,-15,-9,-4,-15,-14,-4,0,6,-10,0,0,-3,-5,-8,-6,-8,-4,-13,-6,-4,-17},\
+{-9,2,0,-7,-13,0,-1,-6,-5,-7,-10,7,-4,-14,-9,-5,-1,-12,-13,-9,-1,-1,-2,-17},\
+{-7,-3,-10,-13,-14,-8,-10,-12,-10,0,0,-4,10,-7,-11,-9,-1,-11,-15,0,-11,-9,-3,-17},\
+{-13,-12,-12,-15,0,-12,-17,-9,-6,-2,0,-14,-7,10,-11,-5,-10,-5,1,-5,-13,-14,-3,-17},\
+{-3,-4,-9,-13,-12,-1,-13,-11,-3,-10,-3,-9,-11,-11,8,-1,-3,-13,-11,-12,-10,-3,-5,-17},\
+{-2,-2,1,-5,-1,-9,-11,-1,-6,-4,-5,-5,-9,-5,-1,8,0,-12,-6,-9,0,-10,-3,-17},\
+{1,-2,0,-6,-6,-8,-8,-7,-6,0,-8,-1,-1,-10,-3,0,7,-16,-10,-4,-2,-8,-2,-17},\
+{-16,-5,-17,-16,-2,-14,-15,-5,-11,-14,-6,-12,-11,-5,-13,-12,-16,10,-4,-16,-16,-14,-8,-17},\
+{-15,-9,-3,-6,0,-9,-12,-14,2,-9,-8,-13,-15,1,-11,-6,-10,-4,10,-12,-4,-10,-4,-17},\
+{0,-10,-10,-5,-8,-13,-5,-5,-14,2,-4,-9,0,-5,-12,-9,-4,-16,-12,7,-7,-7,-3,-17},\
+{-5,-7,6,7,-10,-7,0,-4,-1,-7,-13,-1,-11,-13,-10,0,-2,-16,-4,-7,7,-2,-4,-17},\
+{-5,-3,-6,0,-16,6,6,-3,-2,-12,-6,-1,-9,-14,-3,-10,-8,-14,-10,-7,-2,6,-4,-17},\
+{-3,-3,-3,-3,-5,-4,-4,-4,-3,-2,-4,-2,-3,-3,-5,-3,-2,-8,-4,-3,-4,-4,-3,-17},\
+{-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,1}};
+/*
+ Load the empirical HIV 25% matrix into nucMat.
+ Every cell of nucMat is first reset to 0. Each pair of alphabet symbols then
+ receives its empirical score; for letters the lower-case forms receive the
+ same score.
+ Fix relative to the previous version: the lower-case (+32) mapping used to be
+ applied to '?' and '*' as well, which wrote their scores into the rows and
+ columns of '_' and 'J'. Only letters are case-mapped now.
+*/
+void init_pairscore_hiv25(void) {
+    // protein alphabet, in the row/column order of empirical_hiv25
+    static const char alphabet[] = "ARNDCQEGHILKMFPSTWYVBZ?*";
+    const int case_offset = 'a' - 'A';
+    // reset score matrix to be safe
+    for (int i = 0; i < 127; i++) {
+        for (int j = 0; j < 127; j++) {
+            nucMat[i][j] = 0;
+        }
+    }
+    // map HIV 25% empirical matrix to score matrix
+    for (int i = 0; i < 24; i++) {
+        const int row = alphabet[i];
+        const bool row_is_letter = (row >= 'A' && row <= 'Z');
+        for (int j = 0; j < 24; j++) {
+            const int col = alphabet[j];
+            const bool col_is_letter = (col >= 'A' && col <= 'Z');
+            const int score = empirical_hiv25[i][j];
+            nucMat[row][col] = score;
+            // also map to lowercase, but only where a lowercase form exists
+            if (col_is_letter) {
+                nucMat[row][col + case_offset] = score;
+            }
+            if (row_is_letter) {
+                nucMat[row + case_offset][col] = score;
+                if (col_is_letter) {
+                    nucMat[row + case_offset][col + case_offset] = score;
+                }
+            }
+        }
+    }
+}
+/*
+ Look up the score for aligning character a against character b in nucMat.
+ Characters whose code falls outside the table (127, or any byte >= 0x80,
+ which is negative when char is signed) used to index out of bounds; they now
+ score a neutral 0.
+*/
+extern int pairscore(char a, char b)
+{
+    // go through unsigned char so high-bit bytes cannot become negative indices
+    const unsigned int row = static_cast<unsigned char>(a);
+    const unsigned int col = static_cast<unsigned char>(b);
+    if (row >= 127 || col >= 127)
+    {
+        return 0;
+    }
+    return nucMat[row][col];
+}
+// Reverse the characters of *seq in place.
+void reverse(string* seq)
+{
+    // build the mirror image from reverse iterators, then swap it into place
+    string mirrored(seq->rbegin(), seq->rend());
+    seq->swap(mirrored);
+}
+//Error must be somewhere in here.  Ug...
+/*
+ Pairwise alignment of seqa against seqb with an affine gap penalty.
+ see Gotoh, Osamu. "Optimal alignment between groups of sequences and its application
+ to multiple sequence alignment." Computer applications in the biosciences: CABIOS 9.3
+ (1993): 361-370.
+   seqa, seqb       - input sequences (read only); pair scores come from pairscore()
+   newseqa, newseqb - receive the aligned sequences, with '-' marking gaps. The
+                      alignment is appended back-to-front and both strings are
+                      reversed at the end, so callers should pass empty strings.
+   gip, gep         - gap open and extension penalties, assumed to be positive
+   use_terminal_gap_penalty - when 0, the score is adjusted for residues left
+                      unaligned at the start (+gep each, +gip once).
+                      NOTE(review): the adjustment is added, not subtracted - confirm the sign.
+ Returns the alignment score. The end point is the best cell in the last row
+ or last column of the score matrix.
+ Fixes relative to the previous version:
+   - maxij / maxji are initialised (they were read uninitialised when seqb is empty)
+   - the stop-codon look-ahead checks are bounds-guarded before indexing past i-1 / j-1
+*/
+int align(string* seqa, string* seqb, string* newseqa, string* newseqb,
+          int gip, int gep, int use_terminal_gap_penalty)
+{
+    int M = seqa->size(); // first group of pre-aligned sequences
+    int N = seqb->size(); // second group
+    // if empty ref, return seqb as-is, and seqa as gaps of size(seqb)
+    // prevents a buffer overflow in the traceback matrices which assume M>0
+    if (M==0)
+    {
+        int j;
+        int alignment_score=0;
+        for (j=0 ; j < N ; j++)
+        {
+            // terminal (whole seq) gap adjustment, applied only when the flag is 0
+            if (use_terminal_gap_penalty==0) alignment_score += (j==0) ? (gip+gep) : gep ;
+            *newseqa += '-';
+            *newseqb += (*seqb)[j];
+        }
+        return alignment_score;
+    }
+    int i, j;
+    // not all elements of D, P, and Q need to be stored - vectors are adequate
+    int *SS=new int[N+1];  // D(i, .)
+    int *oldSS=new int[N+1];  // D(i-1, .)
+    int *PP = new int[N+1];  // P(i, .)
+    // Gotoh traceback matrices: piSS/pjSS hold the predecessor cell (row, column)
+    int **piSS = new int*[M+1];
+    int **pjSS = new int*[M+1];
+    int u = -gip; // affine gap initiation penalty
+    int v = -gep; // affine gap extension penalty
+    int w1 = u + v; // gap weight w_k = v * k + u for k = 1
+    int t = u;
+    int s, q;
+    // initialize vectors
+    for (j=0; j<N+1; j++)
+    {
+        SS[j]=0;
+        oldSS[j]=0;
+        PP[j]=0;
+    }
+    // initialize traceback matrices (rows 0 and 1 here, further rows inside the loop)
+    piSS[0] = new int[N+1];
+    pjSS[0] = new int[N+1];
+    piSS[1] = new int[N+1];
+    pjSS[1] = new int[N+1];
+    piSS[1][0] = 0;
+    pjSS[1][0] = 0;
+    piSS[0][1] = 0;
+    pjSS[0][1] = 0;
+    int maxiS = -100000; // best score in the last row (i == M)
+    int maxjS = -100000; // best score in the last column (j == N)
+    int maxij = 0; // column of maxiS; initialised because it is never assigned when N == 0
+    int maxji = 0; // row of maxjS
+    for (i=1; i < M+1; i++)
+    {
+        t += v; // update gap extension
+        s = t;
+        SS[0]=0;
+        q = t + u;
+        // add new rows
+        if (i>1)
+        {
+            piSS[i] = new int[N+1];
+            pjSS[i] = new int[N+1];
+        }
+        for (j = 1; j < N + 1; j++)
+        {
+            // recursive calculation of Q
+            if (q >= s + u )
+                q += v; // extension
+            else
+                q = s + u + v; // open
+            // recursive calculation of P
+            if ((oldSS[j] + w1) > (PP[j] + v))
+                PP[j] = oldSS[j] + w1;
+            else
+                PP[j] += v;
+            int tmp_pp = PP[j];
+            // D(i-1, j-1) + d(a_i, b_j)
+            int pscore = oldSS[j - 1] + pairscore((*seqa)[i - 1], (*seqb)[j - 1]);
+            //no idea if this will work, but its supposed to be a stop codon aligner
+            //the bonus is  assigned on the last codon, if the codons between don't make a big difference it'll be wrong.  Hrm.
+            // window ending at the current position: [i-3..i-1] / [j-3..j-1]
+            if(i >= 3 && j >= 3 && (*seqa)[i-3] == '$' && (*seqa)[i-2] == '$' && (*seqa)[i-1] == '$'  &&
+               (((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'A' && (*seqb)[j-1] == 'G') ||
+               ((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'A' && (*seqb)[j-1] == 'A') ||
+               ((*seqb)[j-3] == 'T' && (*seqb)[j-2] == 'G' && (*seqb)[j-1] == 'A') ))
+            {
+               pscore += 6;
+            }
+            // window shifted by one: [i-2..i] / [j-2..j]; guarded so index i / j stays inside the strings
+            if(i >= 3 && j >= 3 && i < M && j < N &&
+               (*seqa)[i-2] == '$' && (*seqa)[i-1] == '$' && (*seqa)[i] == '$'  &&
+               (((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'A' && (*seqb)[j] == 'G') ||
+               ((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'A' && (*seqb)[j] == 'A') ||
+               ((*seqb)[j-2] == 'T' && (*seqb)[j-1] == 'G' && (*seqb)[j] == 'A') ))
+            {
+               pscore += 6;
+            }
+            // window shifted by two: [i-1..i+1] / [j-1..j+1]; guarded likewise
+            if(i >= 3 && j >= 3 && i + 1 < M && j + 1 < N &&
+               (*seqa)[i-1] == '$' && (*seqa)[i] == '$' && (*seqa)[i+1] == '$'  &&
+               (((*seqb)[j-1] == 'T' && (*seqb)[j] == 'A' && (*seqb)[j+1] == 'G') ||
+               ((*seqb)[j-1] == 'T' && (*seqb)[j] == 'A' && (*seqb)[j+1] == 'A') ||
+               ((*seqb)[j-1] == 'T' && (*seqb)[j] == 'G' && (*seqb)[j+1] == 'A') ))
+            {
+               pscore += 6;
+            }
+            /*
+             The paper states the recurrence as a minimisation:
+             D(i,j) = Min { D(i-1, j-1) + d(a_i, b_j), P(i,j), Q(i,j) }
+             where P(i,j) = Min { D(i-k, j) + w_k } for k = 1, .., i
+               and Q(i,j) = Min { D(i, j-k) + w_k } for k = 1, ..., j
+             This code keeps the largest of the three values instead (gap
+             weights are negative here). The three options are:
+              1. match/mismatch,
+              2. gap open/extension in sequence (a),
+              3. gap open/extension in sequence (b)
+             pscore = D(i-1, j-1) + d(a_i, b_j)
+             tmp_pp = P(i,j)
+             q = Q(i,j)
+            */
+            //maybe just >?
+            if (tmp_pp >= pscore)
+            {
+                if (tmp_pp > q)
+                {
+                    // gap open / extension in (a)
+                    s = tmp_pp;
+                    piSS[i][j] = i - 1;
+                    pjSS[i][j] = j;
+                }
+                else // q >= tmp_pp >= pscore
+                {
+                    // gap open / extension in (b)
+                    s = q;
+                    piSS[i][j] = i;
+                    pjSS[i][j] = j - 1;
+                }
+            }
+            else // pscore > tmp_pp
+            {
+                if (pscore > q)
+                {
+                    // match / mismatch
+                    s = pscore;
+                    piSS[i][j] = i - 1;
+                    pjSS[i][j] = j - 1;
+                }
+                else // q >= pscore > tmp_pp
+                {
+                    // gap open / extension in (b)
+                    s = q;
+                    piSS[i][j] = i;
+                    pjSS[i][j] = j - 1;
+                }
+            }
+            SS[j] = s;
+            if (i == M && SS[j] >= maxiS)
+            {
+                maxiS = SS[j];
+                maxij = j;
+            }
+        }
+        if (SS[N] >= maxjS)
+        {
+            maxjS = SS[N];
+            maxji = i;
+        }
+        for (j = 0; j < N + 1; j++)
+        {
+            oldSS[j] = SS[j];
+        }
+    }
+    if (maxij>N)
+        maxij=N;
+    if (maxji>M)
+        maxji=M;
+    if (maxij<0)
+        maxij=0;
+    if (maxji<0)
+        maxji=0;
+    //add starting -'s
+    // (output is built back-to-front, so these become the end of the final alignment)
+    int alignment_score;
+    if (maxiS > maxjS)
+    {
+        alignment_score = maxiS;
+        i = M;
+        j = maxij;
+        for (int kk = N; kk > maxij; kk--)
+        {
+            *newseqb += (*seqb)[kk - 1];
+            *newseqa += '-';
+        }
+    }
+    else
+    {
+        alignment_score = maxjS;
+        i = maxji;
+        j = N;
+        for (int kk = M; kk > maxji; kk--)
+        {
+            *newseqa += (*seqa)[kk - 1];
+            *newseqb += '-';
+        }
+    }
+    bool decI = false;
+    bool decJ = false;
+    //inserting -'s in the middle!
+    // follow the stored predecessors until either sequence is exhausted
+    while(i >= 1 && j >= 1)
+    {
+        decI=false;
+        decJ=false;
+        if (piSS[i][j] < i)
+        {
+            *newseqa += (*seqa)[i - 1];
+            decI = true;
+        }
+        else
+        {
+            *newseqa += '-';
+        }
+        if (pjSS[i][j] < j)
+        {
+            *newseqb += (*seqb)[j - 1];
+            decJ=true;
+        }
+        else
+        {
+            *newseqb += '-';
+        }
+        if (decI)
+        {
+            i--;
+        }
+        if (decJ)
+        {
+            j--;
+        }
+    }
+    //add extra trailing -'s
+    // terminal gap score adjustment, applied only when the flag is 0
+    if (i < j)
+    {
+        for (int jj = j; jj >= 1; jj--)
+        {
+            *newseqb += (*seqb)[jj - 1];
+            *newseqa += '-';
+            if (use_terminal_gap_penalty==0) alignment_score += gep;
+        }
+        if (use_terminal_gap_penalty==0) alignment_score += gip;
+    }
+    else if(i > j)
+    {
+        for (int ii = i; ii >= 1; ii--)
+        {
+            *newseqa += (*seqa)[ii - 1];
+            *newseqb += '-';
+            if (use_terminal_gap_penalty==0) alignment_score += gep;
+        }
+        if (use_terminal_gap_penalty==0) alignment_score += gip;
+    }
+    reverse(newseqa);
+    reverse(newseqb);
+    for (i = 0; i < M + 1; i++)
+    {
+        delete []piSS[i];
+        delete []pjSS[i];
+    }
+    delete []SS;
+    delete []oldSS;
+    delete []piSS;
+    delete []pjSS;
+    delete []PP;
+    return alignment_score;
+}
+/*
+ Remove pre-existing gap characters ('-') from *seq prior to alignment.
+ The position is kept as string::size_type and compared with string::npos;
+ the previous `unsigned int pos != -1` test only worked through truncation.
+ The search resumes where the last gap was erased instead of rescanning from
+ the start of the string.
+*/
+void degap(string* seq)
+{
+    string::size_type pos = seq->find('-');
+    while (pos != string::npos)
+    {
+        seq->erase(pos, 1);
+        // the character that followed the gap now sits at pos, so search from there
+        pos = seq->find('-', pos);
+    }
+}
+/*
+ Remove leading and trailing whitespace (space, tab, newline, carriage return)
+ from *seq in place.
+ Empty and all-whitespace strings become empty; the previous version indexed
+ (*seq)[size() - 1] on an empty string in those cases.
+*/
+void trim(string* seq)
+{
+    static const char* const whitespace = " \t\n\r";
+    const string::size_type first = seq->find_first_not_of(whitespace);
+    if (first == string::npos)
+    {
+        // nothing but whitespace (or nothing at all)
+        seq->clear();
+        return;
+    }
+    const string::size_type last = seq->find_last_not_of(whitespace);
+    seq->erase(last + 1); // drop the tail first so `first` stays valid
+    seq->erase(0, first);
+}
+/*
+ Cluster gaps in *seq, in place.
+ For every '-' at position i:
+   - backwards: take the letter at i-1 and walk left over a run of that same
+     letter; if the run is preceded by a '-', move the letter into that gap
+     and put a '-' at i-1 (e.g. "-AAA-" style runs slide across the gap).
+   - forwards: the mirror image, starting from the letter at i+1.
+ A scan stops as soon as it meets a different letter or the end of the string.
+ Fixes relative to the previous version: the scan index is signed, so the
+ backward loop terminates at the start of the string (with `unsigned int j`
+ the test `j >= 0` was always true and the scan wrapped around), and
+ positions i-1 / i+1 are only read when they exist.
+*/
+void widen_gaps(string* seq)
+{
+    const int size = static_cast<int>(seq->size());
+    for (int i = 0; i < size; i++)
+    {
+        if ((*seq)[i] != '-')
+        {
+            continue;
+        }
+        //start searching for gaps to cluster
+        //backwards
+        if (i >= 1)
+        {
+            const char letter = (*seq)[i - 1];
+            for (int j = i - 2; j >= 0; j--)
+            {
+                if ((*seq)[j] == '-')
+                {
+                    //swap this gap with the letter at i - 1
+                    (*seq)[j] = letter;
+                    (*seq)[i - 1] = '-';
+                    break;
+                }
+                if ((*seq)[j] != letter)
+                {
+                    break;
+                }
+            }
+        }
+        //forward
+        if (i + 1 < size)
+        {
+            const char letter = (*seq)[i + 1];
+            for (int j = i + 2; j < size; j++)
+            {
+                if ((*seq)[j] == '-')
+                {
+                    //swap this gap with the letter at i + 1
+                    (*seq)[j] = letter;
+                    (*seq)[i + 1] = '-';
+                    break;
+                }
+                if ((*seq)[j] != letter)
+                {
+                    break;
+                }
+            }
+        }
+    }
+}
+#ifdef __PYTHON__
+    /* Python wrapper functions */
+    // Python: align_it(standard, seq, gap_init, gap_extend, use_terminal_gap_penalty)
+    // Aligns two nucleotide sequences (trimmed, gaps kept) and returns
+    // (aligned_standard, aligned_seq, score).
+    static PyObject * align_it(PyObject * self, PyObject * args)
+    {
+        const char * standard;
+        const char * seq;
+        int gap_init_penalty;
+        int gap_extend_penalty;
+        int use_terminal_gap_penalty;
+        if (!PyArg_ParseTuple(args, "ssiii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty, &use_terminal_gap_penalty))
+        {
+            return NULL;
+        }
+        init_pairscore(5, 4); // match, mismatch scores +5, -4 respectively (HyPhy defaults)
+        string reference(standard);
+        string query(seq);
+        trim(&reference);
+        trim(&query);
+        // pre-existing gaps are deliberately left in place here
+        string aligned_reference;
+        string aligned_query;
+        const int score = align(&reference, &query, &aligned_reference, &aligned_query,
+                                gap_init_penalty, gap_extend_penalty, use_terminal_gap_penalty);
+        return Py_BuildValue("ssi", aligned_reference.c_str(), aligned_query.c_str(), score);
+    }
+    // Python: align_it_rb(standard, seq, gap_init, gap_extend)
+    // Emulates the Ruby align_it: match/mismatch scores of 1/1, inputs trimmed
+    // and degapped, terminal-gap flag fixed at 0. Returns (aligned_standard, aligned_seq).
+    static PyObject * align_it_rb(PyObject * self, PyObject * args)
+    {
+        const char * standard;
+        const char * seq;
+        int gap_init_penalty;
+        int gap_extend_penalty;
+        if (!PyArg_ParseTuple(args, "ssii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty))
+        {
+            return NULL;
+        }
+        init_pairscore(1, 1);
+        string reference(standard);
+        string query(seq);
+        trim(&reference);
+        trim(&query);
+        degap(&reference);
+        degap(&query);
+        string aligned_reference;
+        string aligned_query;
+        align(&reference, &query, &aligned_reference, &aligned_query,
+              gap_init_penalty, gap_extend_penalty, 0);
+        return Py_BuildValue("ss", aligned_reference.c_str(), aligned_query.c_str());
+    }
+    // Python: align_it_aa(standard, seq, gap_init, gap_extend, use_terminal_gap_penalty)
+    // Aligns two protein sequences with the empirical HIV 25% matrix (inputs
+    // trimmed, gaps kept) and returns (aligned_standard, aligned_seq, score).
+    static PyObject * align_it_aa(PyObject * self, PyObject * args)
+    {
+        const char * standard;
+        const char * seq;
+        int gap_init_penalty;
+        int gap_extend_penalty;
+        int use_terminal_gap_penalty;
+        if (!PyArg_ParseTuple(args, "ssiii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty, &use_terminal_gap_penalty))
+        {
+            return NULL;
+        }
+        init_pairscore_hiv25();
+        string reference(standard);
+        string query(seq);
+        trim(&reference);
+        trim(&query);
+        // HyPhy behaviour is to not remove gaps, so no degap() here
+        string aligned_reference;
+        string aligned_query;
+        const int score = align(&reference, &query, &aligned_reference, &aligned_query,
+                                gap_init_penalty, gap_extend_penalty, use_terminal_gap_penalty);
+        return Py_BuildValue("ssi", aligned_reference.c_str(), aligned_query.c_str(), score);
+    }
+    // Python: align_it_aa_rb(standard, seq, gap_init, gap_extend)
+    // Emulates the Ruby align_it_aa: init_pairscore_aa(4, -2), inputs trimmed
+    // and degapped, terminal-gap flag fixed at 0. Returns (aligned_standard, aligned_seq).
+    static PyObject * align_it_aa_rb(PyObject * self, PyObject * args)
+    {
+        const char * standard;
+        const char * seq;
+        int gap_init_penalty;
+        int gap_extend_penalty;
+        if (!PyArg_ParseTuple(args, "ssii", &standard, &seq, &gap_init_penalty, &gap_extend_penalty))
+        {
+            return NULL;
+        }
+        init_pairscore_aa(4, -2);
+        string reference(standard);
+        string query(seq);
+        trim(&reference);
+        trim(&query);
+        degap(&reference);
+        degap(&query);
+        string aligned_reference;
+        string aligned_query;
+        align(&reference, &query, &aligned_reference, &aligned_query,
+              gap_init_penalty, gap_extend_penalty, 0);
+        return Py_BuildValue("ss", aligned_reference.c_str(), aligned_query.c_str());
+    }
+    static PyMethodDef AlignmentMethods [] = // method table referenced by AlignmentModuleDef; each row is {Python name, C function, calling convention, docstring}
+    {
+        {"align_it", align_it, METH_VARARGS, "Pairwise alignment of nucleotide sequences."},
+        {"align_it_rb", align_it_rb, METH_VARARGS, "Pairwise alignment of nucleotide sequences using ReCall settings."},
+        {"align_it_aa", align_it_aa, METH_VARARGS, "Pairwise alignment of protein sequences using empirical HIV 25% score matrix."},
+        {"align_it_aa_rb", align_it_aa_rb, METH_VARARGS, "Pairwise alignment of protein sequences using ReCall settings."},
+        {NULL, NULL, 0, NULL} // all-NULL sentinel row terminating the table
+    };
+    static struct PyModuleDef AlignmentModuleDef = { // module definition passed to PyModule_Create() in PyInit_gotoh
+        PyModuleDef_HEAD_INIT,
+        "gotoh", // module name
+        NULL, // no module docstring
+        -1, // m_size field of PyModuleDef
+        AlignmentMethods, // method table
+        NULL,
+        NULL,
+        NULL,
+        NULL
+    };
+    PyMODINIT_FUNC PyInit_gotoh(void) { // module init function: builds and returns the module described by AlignmentModuleDef
+        return PyModule_Create(&AlignmentModuleDef);
+    }
+#else
+    /* Ruby wrapper functions */
+    /*
+     Ruby: CfeGotoh.align_it(standard, seq, gap_init_penalty, gap_extend_penalty)
+     Aligns two nucleotide sequences with match/mismatch scores of 1/1. Both
+     inputs are trimmed and degapped first; the terminal-gap flag passed to
+     align() is 0. Returns a two-element Array [aligned_standard, aligned_seq].
+     Changes relative to the previous version:
+       - arguments are validated with StringValue / NUM2INT before any C++
+         object is created (Ruby raises via longjmp, which skips destructors,
+         so the old heap strings leaked when NUM2INT raised)
+       - the string bytes are copied with their explicit length instead of
+         assuming RSTRING_PTR is NUL-terminated
+    */
+    extern "C" VALUE align_it(VALUE self, VALUE standard, VALUE seq, VALUE gap_init_penalty, VALUE gap_extend_penalty)
+    {
+        StringValue(standard); // raises TypeError unless String-like
+        StringValue(seq);
+        const int gip = NUM2INT(gap_init_penalty);
+        const int gep = NUM2INT(gap_extend_penalty);
+        init_pairscore(1, 1);
+        string seqa(RSTRING_PTR(standard), static_cast<size_t>(RSTRING_LEN(standard)));
+        string seqb(RSTRING_PTR(seq), static_cast<size_t>(RSTRING_LEN(seq)));
+        trim(&seqa);
+        trim(&seqb);
+        degap(&seqa);
+        degap(&seqb);
+        string newseqa;
+        string newseqb;
+        align(&seqa, &seqb, &newseqa, &newseqb, gip, gep, 0);
+        return rb_ary_new3(2, rb_str_new(newseqa.data(), static_cast<long>(newseqa.size())),
+                              rb_str_new(newseqb.data(), static_cast<long>(newseqb.size())));
+    }
+    /*
+     Ruby: CfeGotoh.align_it_aa(standard, seq, gap_init_penalty, gap_extend_penalty)
+     Aligns two protein sequences using init_pairscore_aa(4, -2). Both inputs
+     are trimmed and degapped first; the terminal-gap flag passed to align()
+     is 0. Returns a two-element Array [aligned_standard, aligned_seq].
+     NOTE(review): init_pairscore_aa negates its second argument, so passing -2
+     gives mismatches a score of +2 - confirm this is intended.
+     Changes relative to the previous version:
+       - arguments are validated with StringValue / NUM2INT before any C++
+         object is created (Ruby raises via longjmp, which skips destructors,
+         so the old heap strings leaked when NUM2INT raised)
+       - the string bytes are copied with their explicit length instead of
+         assuming RSTRING_PTR is NUL-terminated
+    */
+    extern "C" VALUE align_it_aa(VALUE self, VALUE standard, VALUE seq, VALUE gap_init_penalty, VALUE gap_extend_penalty)
+    {
+        StringValue(standard); // raises TypeError unless String-like
+        StringValue(seq);
+        const int gip = NUM2INT(gap_init_penalty);
+        const int gep = NUM2INT(gap_extend_penalty);
+        init_pairscore_aa(4, -2);
+        string seqa(RSTRING_PTR(standard), static_cast<size_t>(RSTRING_LEN(standard)));
+        string seqb(RSTRING_PTR(seq), static_cast<size_t>(RSTRING_LEN(seq)));
+        trim(&seqa);
+        trim(&seqb);
+        degap(&seqa);
+        degap(&seqb);
+        string newseqa;
+        string newseqb;
+        align(&seqa, &seqb, &newseqa, &newseqb, gip, gep, 0);
+        return rb_ary_new3(2, rb_str_new(newseqa.data(), static_cast<long>(newseqa.size())),
+                              rb_str_new(newseqb.data(), static_cast<long>(newseqb.size())));
+    }
+    extern "C" void Init_cfe_gotoh() // defines module CfeGotoh and its two module functions; NOTE(review): presumably called by Ruby when the cfe_gotoh extension is loaded - confirm
+    {
+        VALUE gotoh = rb_define_module("CfeGotoh");
+        rb_define_module_function(gotoh, "align_it", (VALUE(*)(...))align_it, 4); // trailing 4 = number of Ruby-level arguments
+        rb_define_module_function(gotoh, "align_it_aa", (VALUE(*)(...))align_it_aa, 4);
+    }
+#endif

data/ext/cfe_gotoh/extconf.rb ADDED Viewed

@@ -0,0 +1,4 @@
+require "mkmf" # Ruby's standard Makefile generator for native extensions
+create_header # writes the mkmf configuration header (extconf.h by default)
+create_makefile('cfe_gotoh/cfe_gotoh') # generates the Makefile for the cfe_gotoh/cfe_gotoh target, the path lib/cfe_gotoh.rb loads via require_relative

data/lib/cfe_gotoh.rb ADDED Viewed

@@ -0,0 +1,413 @@
+#TODO:  Scoring algorithm to improve frame_align?
+require_relative 'cfe_gotoh/cfe_gotoh'
+module CfeGotoh
+  # Base class for CfeGotoh-specific errors.
+  class Error < RuntimeError; end
+  # Raised by cluster_gaps (when raise_errors is set) for a gap that cannot be
+  # merged into a codon-sized gap.
+  class GapMergeError < Error; end
+  sub_matrix = Array.new(127) {Array.new(127) {-1.0} }
+  ['A','T','G','C','R','Y','K','M','B','D','H','V','S','W','N'].each do |nuc|
+    sub_matrix[nuc.ord()][nuc.ord()] = 1.0
+    sub_matrix[nuc.ord()]['X'.ord()]=sub_matrix['X'.ord()][nuc.ord()]=-6.0 if(nuc !='N')
+  end
+  #bi-mixtures
+  sub_matrix['A'.ord()]['R'.ord()]=sub_matrix['R'.ord()]['A'.ord()]=1.0
+  sub_matrix['G'.ord()]['R'.ord()]=sub_matrix['R'.ord()]['G'.ord()]=1.0
+  sub_matrix['C'.ord()]['Y'.ord()]=sub_matrix['Y'.ord()]['C'.ord()]=1.0
+  sub_matrix['T'.ord()]['Y'.ord()]=sub_matrix['Y'.ord()]['T'.ord()]=1.0
+  sub_matrix['G'.ord()]['K'.ord()]=sub_matrix['K'.ord()]['G'.ord()]=1.0
+  sub_matrix['T'.ord()]['K'.ord()]=sub_matrix['K'.ord()]['T'.ord()]=1.0
+  sub_matrix['C'.ord()]['M'.ord()]=sub_matrix['M'.ord()]['C'.ord()]=1.0
+  sub_matrix['A'.ord()]['M'.ord()]=sub_matrix['M'.ord()]['A'.ord()]=1.0
+  sub_matrix['C'.ord()]['S'.ord()]=sub_matrix['S'.ord()]['C'.ord()]=1.0
+  sub_matrix['G'.ord()]['S'.ord()]=sub_matrix['S'.ord()]['G'.ord()]=1.0
+  sub_matrix['T'.ord()]['W'.ord()]=sub_matrix['W'.ord()]['T'.ord()]=1.0
+  sub_matrix['A'.ord()]['W'.ord()]=sub_matrix['W'.ord()]['A'.ord()]=1.0
+  #tri-mixtures
+  sub_matrix['C'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['C'.ord()]=1.0
+  sub_matrix['G'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['G'.ord()]=1.0
+  sub_matrix['T'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['T'.ord()]=1.0
+  sub_matrix['A'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['A'.ord()]=1.0
+  sub_matrix['G'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['G'.ord()]=1.0
+  sub_matrix['T'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['T'.ord()]=1.0
+  sub_matrix['A'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['A'.ord()]=1.0
+  sub_matrix['C'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['C'.ord()]=1.0
+  sub_matrix['T'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['T'.ord()]=1.0
+  sub_matrix['A'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['A'.ord()]=1.0
+  sub_matrix['C'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['C'.ord()]=1.0
+  sub_matrix['G'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['G'.ord()]=1.0
+  #other
+  sub_matrix['$'.ord()]['$'.ord()]=50.0
+  sub_matrix['T'.ord()]['U'.ord()] = sub_matrix['U'.ord()]['T'.ord()] = 1.0
+  sub_matrix['N'.ord()]['N'.ord()] = 0.0
+  sub_matrix['X'.ord()]['-'.ord()]=sub_matrix['-'.ord()]['X'.ord()]=3.0
+  ['A','T','G','C'].each do |ch|
+    sub_matrix[ch.ord()]['*'.ord()]=sub_matrix['*'.ord()][ch.ord()]=1.0
+    sub_matrix[ch.ord()]['&'.ord()]=sub_matrix['&'.ord()][ch.ord()]=0.7
+    sub_matrix[ch.ord()]['$'.ord()]=sub_matrix['$'.ord()][ch.ord()]=0.0
+    sub_matrix[ch.ord()]['.'.ord()]=sub_matrix['.'.ord()][ch.ord()]=-20.0
+    sub_matrix[ch.ord()]['N'.ord()]=sub_matrix['N'.ord()][ch.ord()]=-3.0
+  end
+  sub_matrix.each {|column| column.freeze}
+  sub_matrix.freeze
+  NUCLEOTIDE_MATRIX = sub_matrix
+  def self.score_alignment(standard, query)
+    sc = 0.0
+    0.upto(standard.size() - 1) do |i|
+      sc += NUCLEOTIDE_MATRIX[standard[i,1].upcase().ord()][query[i,1].upcase().ord()]
+    end
+    return sc
+  end
+  def self.make_gap_list(seq)
+    list = []
+    cur_ins = nil
+    prev_i = nil
+    0.upto(seq.size() - 1) do |i|
+      if(seq[i,1] == '-')
+        if(prev_i and i == prev_i + 1)
+          cur_ins << i
+          prev_i = i
+        else
+          list << cur_ins if(cur_ins != nil and cur_ins != [])
+          cur_ins = [i]
+          prev_i = i
+        end
+      end
+    end
+    list << cur_ins if(cur_ins != nil and cur_ins != [])
+    return list
+  end
+  def self.trim_leading_dashes(standard, query)
+    leading_dashes_match = /^(-+)[^-]/.match(standard)
+    if (leading_dashes_match.nil?)
+      return
+    end
+    leading_dashes = leading_dashes_match[1]
+    standard[0, leading_dashes.size()] = ''
+    query[0, leading_dashes.size()] = ''
+  end
+  def self.trim_trailing_dashes(standard, query)
+    trailing_dashes_match = /[^-](-+)$/.match(standard)
+    if (trailing_dashes_match.nil?)
+      return
+    end
+    trailing_dashes = trailing_dashes_match[1]
+    end_of_standard = standard.size() - trailing_dashes.size()
+    standard[end_of_standard, trailing_dashes.size()] = ''
+    query[end_of_standard, trailing_dashes.size()] = ''
+  end
+  def self.fix_incomplete_edge_codon(query, side=:leading)
+    edge_idx = 0
+    dash_regex = /^(-+)[^-]/
+    incr = 1
+    if (side != :leading)  # fix the trailing edge
+      edge_idx = -1
+      dash_regex = /[^-](-+)$/
+      incr = -1
+    end
+    if (query[edge_idx] == '-')
+      dashes = dash_regex.match(query)[1]  # we know there will be a match
+      # If the length of the dashes aren't a multiple of 3, turn some
+      # of the query characters into dashes to force it to be a full
+      # codon of dashes.
+      if (dashes.size() % 3 >= 1)
+        first_non_dash_idx = dashes.size()
+        if (side != :leading)
+          first_non_dash_idx = query.size() - dashes.size() - 1
+        end
+        query[first_non_dash_idx] = '-'
+        if (dashes.size() % 3 == 1)
+          query[first_non_dash_idx + incr] = '-'
+        end
+      end
+    end
+  end
+  # Cancels dashes in +standard+ against nearby dashes in +query+ until the
+  # aligned length is a multiple of 3.  Both strings are modified in place.
+  #
+  # Does nothing when standard.size is already divisible by 3.
+  # Raises RuntimeError when the two strings differ in length.
+  def self.merge_insertions_and_deletions_to_fix_oof_sequences(
+    standard,
+    query
+  )
+    # Merge deletions and insertions until the sequences have a cogent length
+    # (i.e. have length divisible by 3).  This helps fix poor insertions near
+    # the start of the sequence.
+    raise 'Standard and query should be the same length' if standard.size() != query.size()
+    if(standard.size() % 3 != 0)
+      dex = 0
+      # dex walks over the dash positions of the standard; the loop ends when
+      # no dash is found at or after dex.
+      while(dex = standard.index(/-/, dex))
+        [-1, 1, -2, 2].each do |offset|  # look one base away, then two bases away
+          if ((dex + offset >= 0) and query[dex + offset] == '-')
+            # Delete one dash from each string, shortening both by one.
+            standard[dex] = ''
+            query[dex + offset] = ''
+            # NOTE(review): dex is reset to 0 here but incremented below, so
+            # the rescan resumes at index 1 - confirm skipping index 0 is intended.
+            dex = 0
+            break
+          end
+        end
+        # Stop if the sequences are now a cogent length.
+        if(standard.size() % 3 == 0)
+          break
+        end
+        dex += 1
+      end
+    end
+  end
+  # Merges neighbouring gaps whose individual lengths are not multiples of 3
+  # into single gaps whose combined length is.
+  #
+  # gaps         - list of gaps in left-to-right order, each an array of
+  #                positions (frame_align passes the output of make_gap_list).
+  #                This list is modified: a gap absorbed into an earlier one is
+  #                replaced by [].
+  # raise_errors - when true, a gap that cannot be merged raises GapMergeError;
+  #                otherwise the gap is kept unchanged.
+  #
+  # Returns the new list of gaps.
+  def self.cluster_gaps(gaps, raise_errors=false)
+    # Merge adjacent gaps if they are not a codon-sized gap.
+    new_gap_list = []
+    gaps.each_with_index do |gap, i|
+      next if(gap.size() == 0)  # we already ate this one
+      if(gap.size() % 3 == 0)  # this gap is fine!
+        new_gap_list << gap
+        next
+      end
+      gap2 = gaps[i + 1]  # note: these could be nil, which we test for below
+      gap3 = gaps[i + 2]
+      # Can I merge with the next gap?
+      if (gap2 and (gap + gap2).size() % 3 == 0 and (gap2.first - gap.last) < 9)
+        # The merged gap is made contiguous with whichever gap is longer.
+        if(gap2.size() > gap.size())
+          # Extend gap2 to the left by gap.size positions.
+          new_gap_list <<  ((gap2.first - gap.size()) .. gap2.first - 1).to_a() + gap2
+        else
+          # Extend gap to the right by gap2.size positions.
+          new_gap_list <<  gap + ((gap.last + 1) .. (gap.last + gap2.size())).to_a()
+        end
+        gaps[i + 1] = []
+      # Can I merge with the next two gaps?
+      elsif(
+        gap2 and gap3 and
+        (gap + gap2 + gap3).size() % 3 == 0 and
+        (gap3.first - gap.last) < 12
+      )
+        # Place the gap around the middle of the three merging gaps.
+        new_gap = (
+          ((gap2.first - gap.size()) .. gap2.first - 1).to_a() +
+          gap2 +
+          ((gap2.last + 1) .. (gap2.last + gap3.size())).to_a()
+        )
+        new_gap_list << new_gap
+        gaps[i + 1] = []
+        gaps[i + 2] = []
+      else
+        # We can't merge the gaps; either raise an error or meekly proceed.
+        if (raise_errors)
+          raise GapMergeError
+        else
+          new_gap_list << gap  # FIXME this behaviour differs between insertions and deletions
+        end
+      end
+    end
+    return new_gap_list
+  end
+  def self.align_gaps_to_frame(gaps, common_gap_locations=nil)
+    # Align gaps to codon boundaries, giving preference to common
+    # gap locations if specified.
+    # Gaps must be listed in ascending order, i.e. from left to right.
+    offset = 0  # offset created by previous gaps.
+    gaps.each do |gap|
+      # See if this gap is close to a common gap location (within 3 amino acids).
+      if (!common_gap_locations.nil?)
+        closest_common = common_gap_locations.min() do |a, b|
+          (3 * a - (gap[0] - offset)).abs() <=> (3 * b - (gap[0] - offset)).abs()
+        end
+        if(closest_common != nil and (3 * closest_common - (gap[0] - offset)).abs() <= 9)
+          # Align the gap to this position.
+          new_gap = []
+          0.upto(gap.size() - 1) do |i|
+            new_gap << 3 * closest_common + i + offset
+          end
+          gap.replace(new_gap)
+        end
+      end
+      # Align the gap to the nearest appropriate frame.
+      # Original comment from Conan: scoring would be good here
+      if(gap[0] % 3 == 1)  # set back one base
+        new_gap = []
+        gap.each do |i|
+          new_gap << i - 1
+        end
+        gap.replace(new_gap)
+      elsif(gap[0] % 3 == 2)  # set forward one base
+        new_gap = []
+        gap.each do |i|
+          new_gap << i + 1
+        end
+        gap.replace(new_gap)
+      end
+      offset += gap.size()
+    end
+    return gaps
+  end
+  def self.splice_gaps_into_sequence(seq, gaps)
+    # Place the specified gaps into the sequence.  Note that the
+    # gaps are specified by their positions in an *aligned* sequence,
+    # and as such include "offsets" introduced by gaps placed earlier
+    # in the sequence.  The gaps must be in left-to-right order.
+    seq = seq.gsub('-','')
+    gaps.each do |gap|
+      gap.each do |i|
+        if(i > seq.size())
+          seq.insert(-1, '-')
+        else
+          seq.insert(i, '-')
+        end
+      end
+    end
+    return seq
+  end
+  #common_insert_locations is based on amino acid locations starting at base 0.
+  #Assumes standard in the first base.
+  #Prealign lets you run a lot of the corrections and qc on a already aligned sequence.
+  def self.frame_align(
+    standard,
+    query,
+    gap_init=3,
+    gap_penalty=1,
+    common_insert_locations=nil,
+    trim=false,
+    raise_errors=false,
+    prealigned=false
+  )
+    if (common_insert_locations.nil?)
+      common_insert_locations = []
+    end
+    if(!prealigned)
+      elem = align_it(standard, query, gap_init, gap_penalty)
+      standard = elem[0]
+      query = elem[1]
+    end
+    raise "Standard and query should be the same length" if standard.size() != query.size()
+    # Trim leading and trailing dashes if desired.
+    if (trim)
+      trim_leading_dashes(standard, query)
+      trim_trailing_dashes(standard, query)
+      fix_incomplete_edge_codon(query, :leading)
+      fix_incomplete_edge_codon(query, :trailing)
+    end
+    merge_insertions_and_deletions_to_fix_oof_sequences(standard, query)
+    if(standard.gsub(/[^-]/,'').size() % 3 != 0 and raise_errors)
+      raise "Cannot frame align, #{standard.gsub(/[^-]/,'').size()} inserted bases not divisible by 3"
+    end
+    if(query.gsub(/[^-]/,'').size() % 3 != 0 and raise_errors)
+      raise "Cannot frame align, #{query.gsub(/[^-]/,'').size()} deleted bases not divisible by 3"
+    end
+    # Build the insert/delete lists.  These lists look like
+    # [[3,4,5], [9], [11,12]]
+    insert_list = make_gap_list(standard)
+    delete_list = make_gap_list(query)
+    # Process the insertions.
+    if(insert_list.size() > 0)
+      new_ins_list = []
+      # Step 1: cluster the insertions.
+      begin
+        new_ins_list = cluster_gaps(insert_list, raise_errors=raise_errors)
+      rescue GapMergeError
+        raise "Cannot frame align insert" if raise_errors
+      end
+      # Step 2: frame-align the insertions, shifting things to common insertion
+      # positions where appropriate.
+      align_gaps_to_frame(new_ins_list, common_gap_locations=common_insert_locations)
+      # Put the insertions back into the standard.
+      standard = splice_gaps_into_sequence(standard, new_ins_list)
+    end
+    # Process the deletions.
+    if(delete_list.size() > 0)
+      new_del_list = []
+      # As above, step 1 is to cluster the deletions.
+      # FIXME note that the original code behaved differently between
+      # insertions and deletions; confirm that this is the right
+      # way forward.
+      begin
+        new_del_list = cluster_gaps(delete_list, raise_errors=raise_errors)
+      rescue GapMergeError
+        raise "Cannot frame align deletion" if raise_errors
+      end
+      # Again as above, frame-align the deletions; this time
+      # we don't worry about any common deletion positions.
+      align_gaps_to_frame(new_del_list)
+      # Put the deletions back into the query.
+      query = splice_gaps_into_sequence(query, new_del_list)
+    end
+    return [standard, query]
+  end
+  #Returns a [seq_sans_inserts, [list of inserts]]
+  def self.remove_inserts(elem)
+    return remove_insertions_from_query(elem[0], elem[1])
+  end
+  def self.remove_insertions_from_query(standard, query)
+    seq = '' + query
+    inserts = []
+    insert_list = []
+    0.upto(standard.size() - 1) do |i|
+      insert_list << i if(standard[i,1] == '-')
+    end
+    big_insert_list = []
+    if(standard.include?('-'))#Inserts first
+      #First step should be to cluster inserts
+      cur_ins = nil
+      prev_i = nil
+      insert_list.each do |i|
+        if(prev_i and i == prev_i + 1)
+          cur_ins << i
+          prev_i = i
+        else
+          big_insert_list << cur_ins if(cur_ins != nil and cur_ins != [])
+          cur_ins = [i]
+          prev_i = i
+        end
+      end
+      big_insert_list << cur_ins if(cur_ins != nil and cur_ins != [])
+    end
+    offset = 0
+    big_insert_list.each do |ins|
+      ins_seq = ''
+      ins.each do |i|
+        ins_seq += query[i,1]
+      end
+      inserts << [((ins[0] - offset) / 3), ins_seq]
+      offset += ins.size()
+      ins.each do |i|
+        seq[i,1] = '.'
+      end
+    end
+    return [seq.gsub('.',''), inserts]
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,49 @@
+--- !ruby/object:Gem::Specification
+name: cfe_gotoh
+version: !ruby/object:Gem::Version
+  version: 0.4.0.pre
+platform: ruby
+authors:
+- Conan Woods
+- Jamie Kai
+- David Rickett
+- Richard Liang
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2024-11-22 00:00:00.000000000 Z
+dependencies: []
+description:
+email:
+executables: []
+extensions:
+- ext/cfe_gotoh/extconf.rb
+extra_rdoc_files: []
+files:
+- ext/cfe_gotoh/cfe_gotoh.cpp
+- ext/cfe_gotoh/extconf.rb
+- lib/cfe_gotoh.rb
+homepage:
+licenses: []
+metadata:
+  github_repo: ssh://github.com/cfe-lab/gotoh
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 1.3.1
+requirements: []
+rubygems_version: 3.0.9
+signing_key:
+specification_version: 4
+summary: CfE implementation of the Gotoh sequence alignment algorithm
+test_files: []