bacterial-annotator 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/ba_prodigal +1 -1
- data/bin/bacterial-annotator +13 -14
- data/lib/bacterial-annotator/{genbank-manip.rb → sequence-annotation.rb} +128 -16
- data/lib/bacterial-annotator/{fasta-manip.rb → sequence-fasta.rb} +32 -23
- data/lib/bacterial-annotator/{synteny-manip.rb → sequence-synteny.rb} +128 -8
- data/lib/bacterial-annotator.rb +211 -140
- data/lib/bacterial-comparator.rb +1 -0
- metadata +5 -6
- data/lib/bacterial-annotator/remote-ncbi.rb +0 -201
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: befd57ce78f0c186da1553c7372c3aa6faeb9d90
         | 
| 4 | 
            +
              data.tar.gz: 5e37d6a7e579a1e9e428deb9864e4a9d5ea9f057
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: a9a9766113cef56ae7ed35749cd5fbc10d746aa82e403596dccd0c5e7946786b136a69e19c74a4cece73549b0f1a8de077a8c106fc0e2310f7b000dc6cbad962
         | 
| 7 | 
            +
              data.tar.gz: 00a0c5cf815252fa45ffae194318f730cf69e27dd53643659fb06d2dac131a3de881cbad2595b43df6ba5014be75cef8b337e6710afb46e42b420fdd1cf9b178
         | 
    
        data/bin/ba_prodigal
    CHANGED
    
    
    
        data/bin/bacterial-annotator
    CHANGED
    
    | @@ -46,22 +46,21 @@ annotate [OPTIONS] | |
| 46 46 | 
             
                --force/-f		Force to overwrite the output directory
         | 
| 47 47 |  | 
| 48 48 | 
             
              // Dataset
         | 
| 49 | 
            -
                --refgenome/-g | 
| 50 | 
            -
                --guessref | 
| 49 | 
            +
                --refgenome/-g	<GBK_ID> 	Provide a Genbank file or a Gbk Accession ID.
         | 
| 50 | 
            +
                --guessref	 	Will guess the best reference genome to use for the annotation.
         | 
| 51 51 |  | 
| 52 | 
            -
                -- | 
| 53 | 
            -
            			   | 
| 54 | 
            -
            			  Can be very slow, better to use an external database !
         | 
| 55 | 
            -
             | 
| 56 | 
            -
                --externaldb		<proteins fasta_file>
         | 
| 57 | 
            -
            			  Complete or do the annotation of remaining CDS with this database (a protein fasta file).
         | 
| 52 | 
            +
                --externaldb	<proteins fasta_file>
         | 
| 53 | 
            +
            			  Finish or do a complete annotation with this sequence database (a protein fasta file).
         | 
| 58 54 | 
             
            			  Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
         | 
| 59 55 | 
             
            			  >gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
         | 
| 60 56 | 
             
            			  >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
         | 
| 61 57 |  | 
| 62 58 | 
             
              // Other options
         | 
| 63 | 
            -
                --pidentity		Minimum percentage identity to incorporate a CDS annotation [default=0.7]
         | 
| 64 | 
            -
                -- | 
| 59 | 
            +
                --pidentity		<% identity> Minimum percentage identity to incorporate a CDS annotation [default=0.7]
         | 
| 60 | 
            +
                --pcoverage		<% identity> Minimum percentage of coverage over protein alignment to incorporate a CDS annotation [default=0.7]
         | 
| 61 | 
            +
            			             .. otherwise hint for a non-functional protein
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                --minlength		<length> Minimum contig length for annotation [default=500]
         | 
| 65 64 |  | 
| 66 65 | 
             
                --meta		Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
         | 
| 67 66 |  | 
| @@ -77,6 +76,7 @@ def parseOptions_annotate | |
| 77 76 | 
             
              # default options
         | 
| 78 77 | 
             
              options[:outdir] = "BAnnotation"
         | 
| 79 78 | 
             
              options[:pidentity] = 70
         | 
| 79 | 
            +
              options[:pcoverage] = 70
         | 
| 80 80 | 
             
              options[:minlength] = 500
         | 
| 81 81 | 
             
              options[:meta] = 0
         | 
| 82 82 |  | 
| @@ -95,10 +95,10 @@ def parseOptions_annotate | |
| 95 95 | 
             
                  options[:minlength] = ARGV.shift
         | 
| 96 96 | 
             
                when "--pidentity"
         | 
| 97 97 | 
             
                  options[:pidentity] = ARGV.shift
         | 
| 98 | 
            +
                when "--pcoverage"
         | 
| 99 | 
            +
                  options[:pcoverage] = ARGV.shift
         | 
| 98 100 | 
             
                when "--meta"
         | 
| 99 101 | 
             
                  options[:meta] = 1
         | 
| 100 | 
            -
                when "--remotedb"
         | 
| 101 | 
            -
                  options[:remote_db] = ARGV.shift
         | 
| 102 102 | 
             
                when "--externaldb"
         | 
| 103 103 | 
             
                  options[:external_db] = ARGV.shift
         | 
| 104 104 | 
             
                when "--help", "-h"
         | 
| @@ -204,7 +204,7 @@ if ARGV.size > 1 | |
| 204 204 | 
             
              system("ba_raxml")
         | 
| 205 205 |  | 
| 206 206 | 
             
              options = {}
         | 
| 207 | 
            -
              genomes_list = []
         | 
| 207 | 
            +
              genomes_list = []             # TODO multiple input genomes
         | 
| 208 208 |  | 
| 209 209 | 
             
              if ARGV[0] == "annotate"
         | 
| 210 210 |  | 
| @@ -217,7 +217,6 @@ if ARGV.size > 1 | |
| 217 217 |  | 
| 218 218 | 
             
                # Check Options
         | 
| 219 219 | 
             
                if ! options.has_key? :refgenome and
         | 
| 220 | 
            -
                   ! options.has_key? :remote_db and
         | 
| 221 220 | 
             
                   ! options.has_key? :external_db
         | 
| 222 221 | 
             
                  puts "You didn't provide a reference genome or a database for the annotation !"
         | 
| 223 222 | 
             
                elsif ! options.has_key? :input
         | 
| @@ -1,14 +1,13 @@ | |
| 1 1 | 
             
            # -*- coding: utf-8 -*-
         | 
| 2 2 | 
             
            # author:  	maxime déraspe
         | 
| 3 3 | 
             
            # email:	maximilien1er@gmail.com
         | 
| 4 | 
            -
            # review:  	
         | 
| 5 4 | 
             
            # date:    	15-02-24
         | 
| 6 5 | 
             
            # version: 	0.0.1
         | 
| 7 6 | 
             
            # licence:  	
         | 
| 8 7 |  | 
| 9 8 |  | 
| 10 9 |  | 
| 11 | 
            -
            class  | 
| 10 | 
            +
            class SequenceAnnotation
         | 
| 12 11 |  | 
| 13 12 | 
             
              attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
         | 
| 14 13 |  | 
| @@ -67,13 +66,16 @@ class GenbankManip | |
| 67 66 | 
             
                      protId = locustag
         | 
| 68 67 | 
             
                    end
         | 
| 69 68 |  | 
| 70 | 
            -
                    @coding_seq[protId] = { | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 69 | 
            +
                    @coding_seq[protId] = {
         | 
| 70 | 
            +
                      protId: protId,
         | 
| 71 | 
            +
                      location: loc,
         | 
| 72 | 
            +
                      locustag: locustag,
         | 
| 73 | 
            +
                      gene: gene[0],
         | 
| 74 | 
            +
                      product: product[0],
         | 
| 75 | 
            +
                      bioseq: pepBioSeq,
         | 
| 76 | 
            +
                      bioseq_gene: dnaBioSeq,
         | 
| 77 | 
            +
                      bioseq_len: pepBioSeq.length
         | 
| 78 | 
            +
                    }
         | 
| 77 79 | 
             
                  end
         | 
| 78 80 |  | 
| 79 81 | 
             
                end
         | 
| @@ -110,11 +112,13 @@ class GenbankManip | |
| 110 112 | 
             
                    dna = get_DNA(ft,@bioseq)
         | 
| 111 113 | 
             
                    dnaBioSeq = Bio::Sequence.auto(dna)
         | 
| 112 114 |  | 
| 113 | 
            -
                    @rna_seq[locustag] = { | 
| 114 | 
            -
             | 
| 115 | 
            -
             | 
| 116 | 
            -
             | 
| 117 | 
            -
             | 
| 115 | 
            +
                    @rna_seq[locustag] = {
         | 
| 116 | 
            +
                      type: ft.feature.to_s,
         | 
| 117 | 
            +
                      location: loc,
         | 
| 118 | 
            +
                      locustag: locustag,
         | 
| 119 | 
            +
                      product: product,
         | 
| 120 | 
            +
                      bioseq_gene: dnaBioSeq
         | 
| 121 | 
            +
                    }
         | 
| 118 122 |  | 
| 119 123 | 
             
                  end
         | 
| 120 124 |  | 
| @@ -125,7 +129,6 @@ class GenbankManip | |
| 125 129 | 
             
              end
         | 
| 126 130 |  | 
| 127 131 |  | 
| 128 | 
            -
             | 
| 129 132 | 
             
              # Print CDS to files
         | 
| 130 133 | 
             
              # RETURN : cds_file path
         | 
| 131 134 | 
             
              def write_cds_to_file outdir
         | 
| @@ -174,12 +177,108 @@ class GenbankManip | |
| 174 177 | 
             
              end
         | 
| 175 178 |  | 
| 176 179 |  | 
| 180 | 
            +
              # add annotation from reference prot synteny
         | 
| 181 | 
            +
              def add_annotation_ref_synteny_prot synteny_prot, annotations, ref_genome=nil
         | 
| 182 | 
            +
             | 
| 183 | 
            +
                contig = @gbk.definition
         | 
| 184 | 
            +
             | 
| 185 | 
            +
                prot_iterator = 0
         | 
| 186 | 
            +
                @gbk.features.each_with_index do |cds, ft_index|
         | 
| 187 | 
            +
             | 
| 188 | 
            +
                  next if cds.feature != "CDS"
         | 
| 189 | 
            +
             | 
| 190 | 
            +
                  prot_iterator+=1
         | 
| 191 | 
            +
                  prot_id = contig+"_"+prot_iterator.to_s
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                  ftArray = []
         | 
| 194 | 
            +
                  cds.qualifiers = []
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                  hit = nil
         | 
| 197 | 
            +
             | 
| 198 | 
            +
                  next if ! synteny_prot.has_key? prot_id or
         | 
| 199 | 
            +
                    ! synteny_prot[prot_id].has_key? :homology
         | 
| 200 | 
            +
             | 
| 201 | 
            +
                  # puts "#{annotations.keys}"
         | 
| 202 | 
            +
                  if annotations.has_key? synteny_prot[prot_id][:homology][:hits][0]
         | 
| 203 | 
            +
                    hit = annotations[synteny_prot[prot_id][:homology][:hits][0]]
         | 
| 204 | 
            +
                    # puts hit
         | 
| 205 | 
            +
                  else
         | 
| 206 | 
            +
                    puts "no hit for #{prot_id}"
         | 
| 207 | 
            +
                    next
         | 
| 208 | 
            +
                  end
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                  # hit = annotations[synteny_prot[prot_id][:homology][:hits][0]]
         | 
| 211 | 
            +
             | 
| 212 | 
            +
                  if synteny_prot.has_key? prot_id
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                    locus, gene, product, note, inference = nil
         | 
| 215 | 
            +
                    locus = hit[:locustag]
         | 
| 216 | 
            +
                    gene = hit[:gene]
         | 
| 217 | 
            +
                    product = hit[:product]
         | 
| 218 | 
            +
                    note = hit[:note]
         | 
| 219 | 
            +
                    inference = hit[:inference]
         | 
| 220 | 
            +
                    pId = synteny_prot[prot_id][:homology][:pId]
         | 
| 221 | 
            +
                    cov_query = (synteny_prot[prot_id][:homology][:cov_query]*100).round(2)
         | 
| 222 | 
            +
                    cov_subject = (synteny_prot[prot_id][:homology][:cov_subject]*100).round(2)
         | 
| 223 | 
            +
                    reference_prot_id = synteny_prot[prot_id][:homology][:hits][0]
         | 
| 224 | 
            +
             | 
| 225 | 
            +
                    qLocusTag = Bio::Feature::Qualifier.new('locus_tag', "#{prot_id}")
         | 
| 226 | 
            +
                    ftArray.push(qLocusTag)
         | 
| 227 | 
            +
             | 
| 228 | 
            +
                    if gene != nil
         | 
| 229 | 
            +
                      qGene = Bio::Feature::Qualifier.new('gene', gene)
         | 
| 230 | 
            +
                      ftArray.push(qGene)
         | 
| 231 | 
            +
                    end
         | 
| 232 | 
            +
             | 
| 233 | 
            +
                    if product != nil
         | 
| 234 | 
            +
                      qProd = Bio::Feature::Qualifier.new('product', product)
         | 
| 235 | 
            +
                      ftArray.push(qProd)
         | 
| 236 | 
            +
                    end
         | 
| 237 | 
            +
             | 
| 238 | 
            +
                    # check if there is a reference genome.. reference_locus shouldn't be nil in that case
         | 
| 239 | 
            +
                    if locus != nil
         | 
| 240 | 
            +
                      qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (AA identity: #{pId}%; coverage(q,s): #{cov_query}%,#{cov_subject}%) from #{ref_genome}")
         | 
| 241 | 
            +
                      ftArray.push(qNote)
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                      db_source = "[DBSource]"
         | 
| 244 | 
            +
                      if reference_prot_id.include? "_"
         | 
| 245 | 
            +
                        db_source = "RefSeq"
         | 
| 246 | 
            +
                      else
         | 
| 247 | 
            +
                        db_source = "INSD"
         | 
| 248 | 
            +
                      end
         | 
| 249 | 
            +
                      qInference = Bio::Feature::Qualifier.new('inference', "similar to AA sequence:#{db_source}:#{reference_prot_id}")
         | 
| 250 | 
            +
                      ftArray.push(qInference)
         | 
| 251 | 
            +
             | 
| 252 | 
            +
                    end
         | 
| 253 | 
            +
             | 
| 254 | 
            +
                    if note != nil
         | 
| 255 | 
            +
                      qNote = Bio::Feature::Qualifier.new('note', note)
         | 
| 256 | 
            +
                      ftArray.push(qNote)
         | 
| 257 | 
            +
                    end
         | 
| 258 | 
            +
             | 
| 259 | 
            +
                    if inference != nil
         | 
| 260 | 
            +
                      qInference = Bio::Feature::Qualifier.new('inference', inference)
         | 
| 261 | 
            +
                      ftArray.push(qInference)
         | 
| 262 | 
            +
                    end
         | 
| 263 | 
            +
             | 
| 264 | 
            +
                  end
         | 
| 265 | 
            +
             | 
| 266 | 
            +
                  cds.qualifiers = ftArray
         | 
| 267 | 
            +
             | 
| 268 | 
            +
                end
         | 
| 269 | 
            +
             | 
| 270 | 
            +
             | 
| 271 | 
            +
              end
         | 
| 272 | 
            +
             | 
| 273 | 
            +
             | 
| 177 274 | 
             
              # add annotation to a genbank file produced by prodigal
         | 
| 178 275 | 
             
              def add_annotations annotations, mode, reference_locus=nil
         | 
| 179 276 |  | 
| 180 277 | 
             
                # nb_of_added_ft = 0
         | 
| 181 278 | 
             
                i = 0
         | 
| 182 279 |  | 
| 280 | 
            +
                fdebug = File.open("debug-add-annotation.txt","w")
         | 
| 281 | 
            +
             | 
| 183 282 | 
             
                contig = @gbk.definition
         | 
| 184 283 |  | 
| 185 284 | 
             
                if mode == "inplace"
         | 
| @@ -195,9 +294,19 @@ class GenbankManip | |
| 195 294 | 
             
                    i += 1
         | 
| 196 295 | 
             
                    prot_id = contig+"_"+i.to_s
         | 
| 197 296 | 
             
                    hit = nil
         | 
| 198 | 
            -
             | 
| 297 | 
            +
             | 
| 298 | 
            +
                    if annotations.has_key? prot_id
         | 
| 299 | 
            +
                      hit = annotations[prot_id]
         | 
| 300 | 
            +
                    else
         | 
| 301 | 
            +
                      puts "no hit for #{prot_id}"
         | 
| 302 | 
            +
                      next
         | 
| 303 | 
            +
                    end
         | 
| 199 304 |  | 
| 200 305 | 
             
                    if hit != nil
         | 
| 306 | 
            +
             | 
| 307 | 
            +
                      fdebug.write(hit)
         | 
| 308 | 
            +
                      fdebug.write("\n")
         | 
| 309 | 
            +
             | 
| 201 310 | 
             
                      locus, gene, product, note = nil
         | 
| 202 311 | 
             
                      locus = hit[:locustag]
         | 
| 203 312 | 
             
                      gene = hit[:gene]
         | 
| @@ -271,6 +380,8 @@ class GenbankManip | |
| 271 380 |  | 
| 272 381 | 
             
                end
         | 
| 273 382 |  | 
| 383 | 
            +
                fdebug.close
         | 
| 384 | 
            +
             | 
| 274 385 | 
             
              end
         | 
| 275 386 |  | 
| 276 387 |  | 
| @@ -315,3 +426,4 @@ class GenbankManip | |
| 315 426 |  | 
| 316 427 |  | 
| 317 428 | 
             
            end                             # end of Class
         | 
| 429 | 
            +
             | 
| @@ -8,29 +8,35 @@ | |
| 8 8 |  | 
| 9 9 |  | 
| 10 10 |  | 
| 11 | 
            -
            class  | 
| 11 | 
            +
            class SequenceFasta
         | 
| 12 12 |  | 
| 13 | 
            -
              attr_reader :fasta_flat, :fasta_file, : | 
| 13 | 
            +
              attr_reader :fasta_flat, :fasta_file, :annotation_files
         | 
| 14 14 |  | 
| 15 15 | 
             
              # Initialize fasta holder
         | 
| 16 16 | 
             
              def initialize fasta_file, meta
         | 
| 17 17 |  | 
| 18 18 | 
             
                @fasta_file = fasta_file
         | 
| 19 19 | 
             
                @fasta_flat = Bio::FlatFile.auto(@fasta_file)
         | 
| 20 | 
            -
                @meta = meta
         | 
| 21 | 
            -
                @prodigal_files = nil
         | 
| 22 | 
            -
                @single_fasta = nil
         | 
| 23 | 
            -
                @seq_info = nil
         | 
| 24 20 |  | 
| 25 21 | 
             
                if @fasta_flat.dbclass != Bio::FastaFormat
         | 
| 26 22 | 
             
                  abort "Aborting : The input sequence is not a fasta file !"
         | 
| 27 23 | 
             
                end
         | 
| 28 24 |  | 
| 25 | 
            +
                # @contigs = extract_contigs(@fasta_flat)
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                @meta = meta
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                @annotation_files = nil
         | 
| 30 | 
            +
                @single_fasta = nil
         | 
| 31 | 
            +
                @seq_info = nil
         | 
| 32 | 
            +
             | 
| 29 33 | 
             
              end
         | 
| 30 34 |  | 
| 35 | 
            +
             | 
| 31 36 | 
             
              # Run prodigal on the genome to annotate
         | 
| 32 37 | 
             
              def run_prodigal root, outdir
         | 
| 33 | 
            -
             | 
| 38 | 
            +
             | 
| 39 | 
            +
                @annotation_files = {}
         | 
| 34 40 | 
             
                Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
         | 
| 35 41 | 
             
                if @meta
         | 
| 36 42 | 
             
                  system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
         | 
| @@ -38,30 +44,34 @@ class FastaManip | |
| 38 44 | 
             
                  system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
         | 
| 39 45 | 
             
                end
         | 
| 40 46 |  | 
| 41 | 
            -
                @ | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
| 47 | 
            +
                @annotation_files = {
         | 
| 48 | 
            +
                  multiGBK: "#{outdir}/Genbanks.gbk",
         | 
| 49 | 
            +
                  contigs: [],
         | 
| 50 | 
            +
                  contigs_length: [],
         | 
| 51 | 
            +
                  genes: "#{outdir}/Genes.fa",
         | 
| 52 | 
            +
                  proteins: "#{outdir}/Proteins.fa",
         | 
| 53 | 
            +
                  prot_ids_by_contig: {},
         | 
| 54 | 
            +
                  fasta_path: "#{outdir}/single-fasta/",
         | 
| 55 | 
            +
                  gbk_path: "#{outdir}/single-genbank/"
         | 
| 56 | 
            +
                }
         | 
| 57 | 
            +
             | 
| 49 58 | 
             
                split_fasta outdir
         | 
| 50 59 | 
             
                split_genbank outdir, "#{outdir}/Genbanks.gbk"
         | 
| 51 60 | 
             
                extract_cds_names
         | 
| 52 | 
            -
                @ | 
| 61 | 
            +
                @annotation_files
         | 
| 62 | 
            +
             | 
| 53 63 | 
             
              end
         | 
| 54 64 |  | 
| 55 65 |  | 
| 56 | 
            -
              # Split Multi  | 
| 66 | 
            +
              # Split Multi Fasta file
         | 
| 57 67 | 
             
              # RETURN : array of fasta files
         | 
| 58 68 | 
             
              def split_fasta outdir
         | 
| 59 69 | 
             
                @single_fasta = {}
         | 
| 60 70 | 
             
                Dir.mkdir("#{outdir}/single-fasta") if ! Dir.exists?("#{outdir}/single-fasta")
         | 
| 61 71 | 
             
                @fasta_flat.each_entry do |seq|
         | 
| 62 72 | 
             
                  file_name = seq.definition.chomp.split(" ")[0]
         | 
| 63 | 
            -
                  @ | 
| 64 | 
            -
                  @ | 
| 73 | 
            +
                  @annotation_files[:contigs] << "#{file_name}"
         | 
| 74 | 
            +
                  @annotation_files[:contigs_length] << seq.seq.length
         | 
| 65 75 | 
             
                  File.open("#{outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
         | 
| 66 76 | 
             
                    fwrite.write(seq)
         | 
| 67 77 | 
             
                  end
         | 
| @@ -108,7 +118,6 @@ class FastaManip | |
| 108 118 | 
             
                outseq = "ORIGIN\n"
         | 
| 109 119 | 
             
                # puts "ORIGIN"
         | 
| 110 120 |  | 
| 111 | 
            -
                ntNum = 0
         | 
| 112 121 | 
             
                sequence = seq.seq.downcase
         | 
| 113 122 |  | 
| 114 123 | 
             
                nt_left = true
         | 
| @@ -144,7 +153,7 @@ class FastaManip | |
| 144 153 |  | 
| 145 154 | 
             
                prot_ids = {}
         | 
| 146 155 | 
             
                prot_length = {}
         | 
| 147 | 
            -
                flatfile = Bio::FlatFile.auto(@ | 
| 156 | 
            +
                flatfile = Bio::FlatFile.auto(@annotation_files[:proteins])
         | 
| 148 157 |  | 
| 149 158 | 
             
                flatfile.each_entry do |entry|
         | 
| 150 159 | 
             
                  prot_id = entry.definition.split(" ")[0]
         | 
| @@ -163,8 +172,8 @@ class FastaManip | |
| 163 172 | 
             
                  prot_array.sort! { |a,b| a.split("_")[-1].to_i <=> b.split("_")[-1].to_i }
         | 
| 164 173 | 
             
                end
         | 
| 165 174 |  | 
| 166 | 
            -
                @ | 
| 167 | 
            -
                @ | 
| 175 | 
            +
                @annotation_files[:prot_ids_by_contig] = prot_ids
         | 
| 176 | 
            +
                @annotation_files[:prot_ids_length] = prot_length
         | 
| 168 177 |  | 
| 169 178 | 
             
              end
         | 
| 170 179 |  | 
| @@ -7,20 +7,43 @@ | |
| 7 7 | 
             
            # licence:  	
         | 
| 8 8 |  | 
| 9 9 |  | 
| 10 | 
            +
            class SequenceSynteny
         | 
| 10 11 |  | 
| 11 | 
            -
             | 
| 12 | 
            +
              attr_reader :query_file, :subject_file, :aln_hits, :query_sequences, :subject_sequences
         | 
| 12 13 |  | 
| 13 | 
            -
               | 
| 14 | 
            -
             | 
| 15 | 
            -
              def initialize query_file, subject_file, name, pidentity, type
         | 
| 14 | 
            +
              def initialize query_file, subject_file, name, pidentity, min_coverage, type
         | 
| 16 15 | 
             
                @query_file = query_file
         | 
| 17 16 | 
             
                @subject_file = subject_file
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                @query_sequences = get_sequences(query_file)
         | 
| 19 | 
            +
                @subject_sequences = get_sequences(subject_file)
         | 
| 20 | 
            +
             | 
| 18 21 | 
             
                @name = name
         | 
| 19 22 | 
             
                @pidentity = pidentity
         | 
| 23 | 
            +
                @min_coverage = min_coverage
         | 
| 20 24 | 
             
                @aln_file = nil
         | 
| 21 25 | 
             
                @type = type
         | 
| 26 | 
            +
             | 
| 22 27 | 
             
              end                           # end of initialize
         | 
| 23 28 |  | 
| 29 | 
            +
             | 
| 30 | 
            +
              # get sequences name with length in hash
         | 
| 31 | 
            +
              def get_sequences seq_file
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                sequences = {}
         | 
| 34 | 
            +
                flat = Bio::FlatFile.auto("#{seq_file}")
         | 
| 35 | 
            +
                flat.each_entry do |s|
         | 
| 36 | 
            +
                  s_name = s.definition.chomp.split(" ")[0]
         | 
| 37 | 
            +
                  sequences[s_name] = {}
         | 
| 38 | 
            +
                  sequences[s_name][:length] = s.seq.length
         | 
| 39 | 
            +
                  sequences[s_name][:conserved] = false
         | 
| 40 | 
            +
                  sequences[s_name][:contig] = s_name.split("_")[0..-2].join("_") if s_name.include? "_"
         | 
| 41 | 
            +
                end
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                sequences
         | 
| 44 | 
            +
             | 
| 45 | 
            +
              end
         | 
| 46 | 
            +
             | 
| 24 47 | 
             
              # run blat on proteins
         | 
| 25 48 | 
             
              def run_blat root, outdir
         | 
| 26 49 | 
             
                base_cmd = "#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity}"
         | 
| @@ -32,9 +55,98 @@ class SyntenyManip | |
| 32 55 | 
             
                # extract_hits
         | 
| 33 56 | 
             
              end                           # end of method
         | 
| 34 57 |  | 
| 58 | 
            +
             | 
| 59 | 
            +
              # Extract Hit from blast8 file and save it in hash
         | 
| 60 | 
            +
              # contig-0_1      ABJ71957.1      96.92   65      2       0       1       65      1       65      9.2e-31 131.0
         | 
| 61 | 
            +
              # Parse the blast8 (tabular) alignment file at @aln_file and record, for
              # every query sequence, its best-scoring hit(s) in @query_sequences and
              # the reverse link in @subject_sequences.
              #
              # Columns read from each tab-separated line:
              #   0 query id, 1 subject id, 2 % identity, 3 alignment length,
              #   6-7 query start/end, 8-9 subject start/end, 10 e-value, 11 bit score
              #
              # mode - :refgenome or :externaldb; both currently take the subject id
              #        verbatim from column 1 and tag the hit as a "cds" feature.
              #
              # Side effects, per query `key` and subject `hit`:
              #   @query_sequences[key][:homology]  - hash describing the best hit(s)
              #   @query_sequences[key][:conserved] - set to true
              #   @subject_sequences[hit][:conserved] - set to true (best hit only)
              #   @subject_sequences[hit][:hits]    - list of query ids
              #
              # Returns nil.
              def extract_hits mode

                File.foreach(@aln_file) do |line|

                  # chomp, not chomp!: chomp! returns nil when the line has no
                  # trailing newline (typically the last line of the file).
                  lA = line.chomp.split("\t")
                  next if lA.empty?         # ignore blank lines

                  key = lA[0]

                  # extraction of hit id depends on mode ..
                  # (the external db id used to be lA[1].split("|")[3])
                  hit = nil
                  feature = ""
                  if mode == :refgenome || mode == :externaldb
                    hit = lA[1]
                    feature = "cds"
                  end

                  query = @query_sequences[key]
                  subject = @subject_sequences[hit]
                  score = lA[11].to_f
                  query_loc = [lA[6].to_i, lA[7].to_i]
                  subject_loc = [lA[8].to_i, lA[9].to_i]

                  # compute coverage based on sequences length
                  cov_query = (lA[3].to_f/query[:length]).round(2)
                  cov_subject = (lA[3].to_f/subject[:length]).round(2)

                  # assert cutoff on identity and coverage
                  # 1 -> pass cutoff, 0 under cutoff
                  assert_cutoff = [1,1,1]
                  assert_cutoff[0] = 0 if lA[2].to_f < @pidentity
                  assert_cutoff[1] = 0 if cov_query < @min_coverage
                  assert_cutoff[2] = 0 if cov_subject < @min_coverage

                  # first hit for query, or a hit scoring better than the stored one
                  if ! query.has_key?(:homology) || score > query[:homology][:score]
                    query[:conserved] = true
                    # the subject table is indexed by hit id, not by query id
                    subject[:conserved] = true
                    query[:homology] = {
                      pId: lA[2].to_f.round(2),
                      cov_query: cov_query,
                      cov_subject: cov_subject,
                      evalue: lA[10],
                      score: score,
                      hits: [hit],
                      length: [lA[3].to_i],
                      query_location: [query_loc],
                      subject_location: [subject_loc],
                      feature: feature,
                      assert_cutoff: assert_cutoff
                    }
                    # NOTE(review): this replaces any query ids previously stored for
                    # this subject -- confirm that overwriting is intended.
                    subject[:hits] = [key]

                  # query already got at least 1 hit and score == last_score
                  elsif score == query[:homology][:score]
                    homology = query[:homology]
                    homology[:hits] << hit
                    homology[:length] << lA[3].to_i
                    homology[:query_location] << query_loc
                    homology[:subject_location] << subject_loc
                    if subject.has_key? :hits
                      subject[:hits] << key
                    else
                      subject[:hits] = [key]
                    end
                  end

                end

              end                           # end of method
         | 
| 145 | 
            +
             | 
| 146 | 
            +
             | 
| 35 147 | 
             
              # Extract Hit from blast8 file and save it in hash
         | 
| 36 148 | 
             
              # contig-0_1      ABJ71957.1      96.92   65      2       0       1       65      1       65      9.2e-31 131.0
         | 
| 37 | 
            -
              def extract_hits_prodigal mode | 
| 149 | 
            +
              def extract_hits_prodigal mode
         | 
| 38 150 |  | 
| 39 151 | 
             
                @aln_hits = {}
         | 
| 40 152 | 
             
                feature = ""
         | 
| @@ -49,8 +161,8 @@ class SyntenyManip | |
| 49 161 | 
             
                      hit = lA[1].chomp.split("|")[3]
         | 
| 50 162 | 
             
                      feature = "cds"
         | 
| 51 163 | 
             
                    end
         | 
| 164 | 
            +
                    next if lA[2].to_f < @pidentity
         | 
| 52 165 | 
             
                    if ! @aln_hits.has_key? key
         | 
| 53 | 
            -
                      next if lA[2].to_f < @pidentity
         | 
| 54 166 | 
             
                      @aln_hits[key] = {
         | 
| 55 167 | 
             
                        pId: lA[2].to_f.round(2),
         | 
| 56 168 | 
             
                        evalue: lA[10],
         | 
| @@ -99,10 +211,12 @@ class SyntenyManip | |
| 99 211 | 
             
                      feature = hit_split[1]
         | 
| 100 212 | 
             
                      product = hit_split[2]
         | 
| 101 213 | 
             
                    end
         | 
| 214 | 
            +
                    next if lA[2].to_f < @pidentity
         | 
| 102 215 | 
             
                    if ! @aln_hits.has_key? key
         | 
| 103 | 
            -
                      next if lA[2].to_f < @pidentity
         | 
| 104 216 | 
             
                      @aln_hits[key] = {
         | 
| 105 217 | 
             
                        pId: lA[2].to_f.round(2),
         | 
| 218 | 
            +
                        # cov_query: (@query_sequences[key][:length]/lA[3].to_f).round(2),
         | 
| 219 | 
            +
                        # cov_subject: (@subject_sequences[hit][:length]/lA[3].to_f).round(2),
         | 
| 106 220 | 
             
                        evalue: lA[10],
         | 
| 107 221 | 
             
                        score: lA[11].to_f,
         | 
| 108 222 | 
             
                        hits: [hit],
         | 
| @@ -115,6 +229,8 @@ class SyntenyManip | |
| 115 229 | 
             
                    elsif lA[11].to_f > @aln_hits[key][:score]
         | 
| 116 230 | 
             
                      @aln_hits[key] = {
         | 
| 117 231 | 
             
                        pId: lA[2].to_f.round(2),
         | 
| 232 | 
            +
                        # cov_query: (@query_sequences[key][:length]/lA[3].to_f).round(2),
         | 
| 233 | 
            +
                        # cov_subject: (@subject_sequences[hit][:length]/lA[3].to_f).round(2),
         | 
| 118 234 | 
             
                        evalue: lA[10],
         | 
| 119 235 | 
             
                        score: lA[11].to_f,
         | 
| 120 236 | 
             
                        hits: [hit],
         | 
| @@ -135,7 +251,7 @@ class SyntenyManip | |
| 135 251 | 
             
                  end
         | 
| 136 252 | 
             
                end
         | 
| 137 253 |  | 
| 138 | 
            -
                prune_aln_hits @aln_hits
         | 
| 254 | 
            +
                # prune_aln_hits @aln_hits
         | 
| 139 255 |  | 
| 140 256 | 
             
              end                           # end of method
         | 
| 141 257 |  | 
| @@ -178,6 +294,10 @@ class SyntenyManip | |
| 178 294 | 
             
                      annotations[p][:length] = @aln_hits[p][:length][hit_index]
         | 
| 179 295 | 
             
                      i+=1
         | 
| 180 296 |  | 
| 297 | 
            +
                      File.open("debug-annotation-by-contig.txt","a") do |fout|
         | 
| 298 | 
            +
                        fout.write("#{p} #{@aln_hits[p][:pId]} #{@aln_hits[p][:cov_query]} #{@aln_hits[p][:cov_subject]} #{ref_cds[h][:product]}\n")
         | 
| 299 | 
            +
                      end
         | 
| 300 | 
            +
             | 
| 181 301 | 
             
                    else
         | 
| 182 302 |  | 
| 183 303 | 
             
                      annotations[p] = nil
         |