bacterial-annotator 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/bin/bacterial-annotator +39 -29
 - data/lib/bacterial-annotator/sequence-annotation.rb +209 -30
 - data/lib/bacterial-annotator/sequence-fasta.rb +21 -18
 - data/lib/bacterial-annotator/sequence-synteny.rb +77 -20
 - data/lib/bacterial-annotator.rb +201 -64
 - data/lib/bacterial-comparator.rb +42 -26
 - data/lib/bacterial-identificator.rb +86 -13
 - metadata +3 -3
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 10f3d2469fb3aaf64e6b84076e05ab9e1ae41cd6
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: f08a5465ce584dd888074c7d0146c1450386598e
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: bd006cf021f0a74f1e98fa6367ca4aca0abb36004f375654ec552b68e1ac8ebc5c1f65e38a480473551848d25ac6be904c5d1841cc60657a47384169d368a18c
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: b5a8cb5c74c028e813bbc585e70b6dcb420b8c8f4ad659e8e4c985bce868009a7f5d6015c4e396768a6146c918943e4586a638c084d844cc91be6ac927c993b6
         
     | 
    
        data/bin/bacterial-annotator
    CHANGED
    
    | 
         @@ -63,27 +63,28 @@ def usage_annotate 
     | 
|
| 
       63 
63 
     | 
    
         
             
            annotate [OPTIONS]
         
     | 
| 
       64 
64 
     | 
    
         | 
| 
       65 
65 
     | 
    
         
             
              // IO
         
     | 
| 
       66 
     | 
    
         
            -
                --input/-i 
     | 
| 
       67 
     | 
    
         
            -
                --outdir/-o 
     | 
| 
       68 
     | 
    
         
            -
                -- 
     | 
| 
       69 
     | 
    
         
            -
                -- 
     | 
| 
      
 66 
     | 
    
         
            +
                --input/-i      <fasta_file>         Provide the fasta file to annotate
         
     | 
| 
      
 67 
     | 
    
         
            +
                --outdir/-o     <outdir>             Output directory [default=BAnnotation]
         
     | 
| 
      
 68 
     | 
    
         
            +
                --name/-n       <name>               Sample name
         
     | 
| 
      
 69 
     | 
    
         
            +
                --force/-f                           Force to overwrite the output directory
         
     | 
| 
       70 
70 
     | 
    
         | 
| 
       71 
71 
     | 
    
         
             
              // MERGEM-based Annotation (Recommended)
         
     | 
| 
       72 
     | 
    
         
            -
                --db/-d 
     | 
| 
      
 72 
     | 
    
         
            +
                --db/-d         <species_dir>        From MERGEM database (include CDS and RNAs fasta)
         
     | 
| 
      
 73 
     | 
    
         
            +
                                                     // see bacteriapps.genome.ulaval.ca/mergem
         
     | 
| 
       73 
74 
     | 
    
         | 
| 
       74 
75 
     | 
    
         
             
              // Reference-Based Annotation
         
     | 
| 
       75 
     | 
    
         
            -
                --refgenome/-g 
     | 
| 
       76 
     | 
    
         
            -
                --externaldb 
     | 
| 
       77 
     | 
    
         
            -
             
     | 
| 
       78 
     | 
    
         
            -
             
     | 
| 
       79 
     | 
    
         
            -
             
     | 
| 
       80 
     | 
    
         
            -
             
     | 
| 
       81 
     | 
    
         
            -
             
     | 
| 
       82 
     | 
    
         
            -
                -- 
     | 
| 
       83 
     | 
    
         
            -
             
     | 
| 
       84 
     | 
    
         
            -
             
     | 
| 
       85 
     | 
    
         
            -
             
     | 
| 
       86 
     | 
    
         
            -
                --meta 
     | 
| 
      
 76 
     | 
    
         
            +
                --refgenome/-g  <GBK_ID>             Provide a Genbank file or a Gbk Accession ID.
         
     | 
| 
      
 77 
     | 
    
         
            +
                --externaldb    <fasta_file>         Finish or do a complete annotation with this sequence database (protein fasta file).
         
     | 
| 
      
 78 
     | 
    
         
            +
                                                     Fasta headers need to look similar to NCBI or EBI fasta headers
         
     | 
| 
      
 79 
     | 
    
         
            +
                                                     EX: >gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
         
     | 
| 
      
 80 
     | 
    
         
            +
                                                         >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
         
     | 
| 
      
 81 
     | 
    
         
            +
             
     | 
| 
      
 82 
     | 
    
         
            +
              // Options
         
     | 
| 
      
 83 
     | 
    
         
            +
                --pidentity     <% identity>          Minimum percentage identity to incorporate a CDS annotation [default=0.8]
         
     | 
| 
      
 84 
     | 
    
         
            +
                --pcoverage     <% identity>          Minimum percentage of coverage over protein alignment to incorporate a CDS annotation [default=0.8]
         
     | 
| 
      
 85 
     | 
    
         
            +
                                                      // otherwise hint for a non-functional protein
         
     | 
| 
      
 86 
     | 
    
         
            +
                --minlength     <length>              Minimum contig length for annotation [default=500]
         
     | 
| 
      
 87 
     | 
    
         
            +
                --meta                                Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
         
     | 
| 
       87 
88 
     | 
    
         | 
| 
       88 
89 
     | 
    
         
             
            OEM
         
     | 
| 
       89 
90 
     | 
    
         | 
| 
         @@ -101,6 +102,11 @@ def parseOptions_annotate 
     | 
|
| 
       101 
102 
     | 
    
         
             
              options[:minlength] = 500
         
     | 
| 
       102 
103 
     | 
    
         
             
              options[:meta] = 0
         
     | 
| 
       103 
104 
     | 
    
         | 
| 
      
 105 
     | 
    
         
            +
              if ARGV.length == 0
         
     | 
| 
      
 106 
     | 
    
         
            +
                usage_annotate
         
     | 
| 
      
 107 
     | 
    
         
            +
                abort
         
     | 
| 
      
 108 
     | 
    
         
            +
              end
         
     | 
| 
      
 109 
     | 
    
         
            +
             
     | 
| 
       104 
110 
     | 
    
         
             
              while x = ARGV.shift
         
     | 
| 
       105 
111 
     | 
    
         | 
| 
       106 
112 
     | 
    
         
             
                case x.downcase
         
     | 
| 
         @@ -224,12 +230,14 @@ def usage_identify 
     | 
|
| 
       224 
230 
     | 
    
         | 
| 
       225 
231 
     | 
    
         
             
            identify [OPTIONS] genome_1.fasta genome_2.fasta genome_x.fasta
         
     | 
| 
       226 
232 
     | 
    
         | 
| 
       227 
     | 
    
         
            -
              // 
     | 
| 
       228 
     | 
    
         
            -
                -- 
     | 
| 
      
 233 
     | 
    
         
            +
              //Mash Sketch
         
     | 
| 
      
 234 
     | 
    
         
            +
                --mash/-m      <mash sketch file>
         
     | 
| 
       229 
235 
     | 
    
         | 
| 
       230 
236 
     | 
    
         
             
              //IO
         
     | 
| 
       231 
237 
     | 
    
         
             
                --proc         <nb of process> Number of process to run the comparison
         
     | 
| 
       232 
238 
     | 
    
         | 
| 
      
 239 
     | 
    
         
            +
                --output       [csv,tsv|json]
         
     | 
| 
      
 240 
     | 
    
         
            +
             
     | 
| 
       233 
241 
     | 
    
         
             
            OEM
         
     | 
| 
       234 
242 
     | 
    
         | 
| 
       235 
243 
     | 
    
         
             
            end
         
     | 
| 
         @@ -238,21 +246,24 @@ def parseOptions_identify 
     | 
|
| 
       238 
246 
     | 
    
         | 
| 
       239 
247 
     | 
    
         
             
              options = {}
         
     | 
| 
       240 
248 
     | 
    
         
             
              options[:proc] = 2
         
     | 
| 
       241 
     | 
    
         
            -
              options[: 
     | 
| 
      
 249 
     | 
    
         
            +
              options[:genome_list] = []
         
     | 
| 
      
 250 
     | 
    
         
            +
              options[:output] = "tsv"
         
     | 
| 
       242 
251 
     | 
    
         | 
| 
       243 
252 
     | 
    
         
             
              while x = ARGV.shift
         
     | 
| 
       244 
253 
     | 
    
         | 
| 
       245 
254 
     | 
    
         
             
                case x.downcase
         
     | 
| 
       246 
     | 
    
         
            -
                when "-- 
     | 
| 
       247 
     | 
    
         
            -
                  options[: 
     | 
| 
      
 255 
     | 
    
         
            +
                when "--mash", "-m"
         
     | 
| 
      
 256 
     | 
    
         
            +
                  options[:mash_file] = ARGV.shift
         
     | 
| 
       248 
257 
     | 
    
         
             
                when "--proc", "-p"
         
     | 
| 
       249 
258 
     | 
    
         
             
                  options[:proc] = ARGV.shift
         
     | 
| 
      
 259 
     | 
    
         
            +
                when "--output", "-o"
         
     | 
| 
      
 260 
     | 
    
         
            +
                  options[:output] = ARGV.shift
         
     | 
| 
       250 
261 
     | 
    
         
             
                when "--help", "-h"
         
     | 
| 
       251 
262 
     | 
    
         
             
                  usage_identify
         
     | 
| 
       252 
263 
     | 
    
         
             
                  abort
         
     | 
| 
       253 
264 
     | 
    
         
             
                else
         
     | 
| 
       254 
265 
     | 
    
         
             
                  if File.exists? "#{x}"
         
     | 
| 
       255 
     | 
    
         
            -
                    options[: 
     | 
| 
      
 266 
     | 
    
         
            +
                    options[:genome_list] << x
         
     | 
| 
       256 
267 
     | 
    
         
             
                  else
         
     | 
| 
       257 
268 
     | 
    
         
             
                    puts "#{x} file doesn't exist"
         
     | 
| 
       258 
269 
     | 
    
         
             
                    usage_identify
         
     | 
| 
         @@ -302,14 +313,14 @@ if ARGV.size >= 1 
     | 
|
| 
       302 
313 
     | 
    
         | 
| 
       303 
314 
     | 
    
         
             
                # Check Options
         
     | 
| 
       304 
315 
     | 
    
         
             
                if ! options.has_key? :refgenome and
         
     | 
| 
       305 
     | 
    
         
            -
                   ! options.has_key? :external_db
         
     | 
| 
      
 316 
     | 
    
         
            +
                   ! options.has_key? :external_db and
         
     | 
| 
      
 317 
     | 
    
         
            +
                   ! options.has_key? :mergem
         
     | 
| 
       306 
318 
     | 
    
         
             
                  puts "You didn't provide a reference genome or a database for the annotation !"
         
     | 
| 
       307 
319 
     | 
    
         
             
                elsif ! options.has_key? :input
         
     | 
| 
       308 
320 
     | 
    
         
             
                  puts "You didn't provide a fasta file to annotate !"
         
     | 
| 
       309 
321 
     | 
    
         
             
                end
         
     | 
| 
       310 
322 
     | 
    
         | 
| 
       311 
323 
     | 
    
         
             
                bannot = BacterialAnnotator.new(options, ROOT)
         
     | 
| 
       312 
     | 
    
         
            -
                bannot.prepare_files_for_annotation
         
     | 
| 
       313 
324 
     | 
    
         
             
                bannot.run_annotation
         
     | 
| 
       314 
325 
     | 
    
         | 
| 
       315 
326 
     | 
    
         
             
              elsif ARGV[0] == "compare"
         
     | 
| 
         @@ -317,20 +328,19 @@ if ARGV.size >= 1 
     | 
|
| 
       317 
328 
     | 
    
         
             
                ARGV.shift
         
     | 
| 
       318 
329 
     | 
    
         
             
                options = parseOptions_compare
         
     | 
| 
       319 
330 
     | 
    
         
             
                bcomp = BacterialComparator.new(options, ROOT)
         
     | 
| 
       320 
     | 
    
         
            -
                 
     | 
| 
       321 
     | 
    
         
            -
                bcomp.mafft_aln aln_opt
         
     | 
| 
       322 
     | 
    
         
            -
                bcomp.raxml_tree aln_opt, options[:bootstrap] if options[:phylogeny] == 1
         
     | 
| 
      
 331 
     | 
    
         
            +
                bcomp.run_comparison
         
     | 
| 
       323 
332 
     | 
    
         | 
| 
       324 
333 
     | 
    
         
             
              elsif ARGV[0] == "identify"
         
     | 
| 
       325 
334 
     | 
    
         | 
| 
       326 
335 
     | 
    
         
             
                ARGV.shift
         
     | 
| 
       327 
336 
     | 
    
         
             
                options = parseOptions_identify
         
     | 
| 
       328 
     | 
    
         
            -
                if options[: 
     | 
| 
      
 337 
     | 
    
         
            +
                if options[:genome_list].empty?
         
     | 
| 
       329 
338 
     | 
    
         
             
                  puts "You need at least 1 genome fasta to identify !!"
         
     | 
| 
       330 
339 
     | 
    
         
             
                  usage_identify
         
     | 
| 
       331 
340 
     | 
    
         
             
                  abort
         
     | 
| 
       332 
341 
     | 
    
         
             
                end
         
     | 
| 
       333 
342 
     | 
    
         
             
                bident = BacterialIdentificator.new(options, ROOT)
         
     | 
| 
      
 343 
     | 
    
         
            +
                bident.run_identification
         
     | 
| 
       334 
344 
     | 
    
         | 
| 
       335 
345 
     | 
    
         
             
              elsif ARGV[0] == "--version" or ARGV[0] == "-v"
         
     | 
| 
       336 
346 
     | 
    
         | 
| 
         @@ -5,27 +5,208 @@ 
     | 
|
| 
       5 
5 
     | 
    
         
             
            # version: 	0.0.1
         
     | 
| 
       6 
6 
     | 
    
         
             
            # licence:  	
         
     | 
| 
       7 
7 
     | 
    
         | 
| 
      
 8 
     | 
    
         
            +
            require 'json'
         
     | 
| 
      
 9 
     | 
    
         
            +
            require 'zlib'
         
     | 
| 
       8 
10 
     | 
    
         | 
| 
       9 
11 
     | 
    
         | 
| 
       10 
12 
     | 
    
         
             
            class SequenceAnnotation
         
     | 
| 
       11 
13 
     | 
    
         | 
| 
       12 
     | 
    
         
            -
              attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
         
     | 
| 
      
 14 
     | 
    
         
            +
              attr_accessor :gbk, :coding_seq, :rna_seq, :cds_file, :rna_file
         
     | 
| 
       13 
15 
     | 
    
         | 
| 
       14 
16 
     | 
    
         
             
              # Initialize the genbank file
         
     | 
| 
       15 
     | 
    
         
            -
              def initialize  
     | 
| 
      
 17 
     | 
    
         
            +
              def initialize root, outdir, file_ref, type
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
                @root = root
         
     | 
| 
      
 20 
     | 
    
         
            +
                @outdir = outdir
         
     | 
| 
      
 21 
     | 
    
         
            +
                @coding_seq = {}
         
     | 
| 
      
 22 
     | 
    
         
            +
                @rna_seq = {}
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
                case type
         
     | 
| 
      
 25 
     | 
    
         
            +
                when "refGbk"
         
     | 
| 
      
 26 
     | 
    
         
            +
                  # reference genome used for annotation
         
     | 
| 
      
 27 
     | 
    
         
            +
                  reference_gbk file_ref
         
     | 
| 
      
 28 
     | 
    
         
            +
                when "db"
         
     | 
| 
      
 29 
     | 
    
         
            +
                  # reference database used for annotation
         
     | 
| 
      
 30 
     | 
    
         
            +
                  reference_db file_ref
         
     | 
| 
      
 31 
     | 
    
         
            +
                when "fasta"
         
     | 
| 
      
 32 
     | 
    
         
            +
                  # single fasta database for annotation (completion)
         
     | 
| 
      
 33 
     | 
    
         
            +
                  single_fasta file_ref
         
     | 
| 
      
 34 
     | 
    
         
            +
                when "newGbk"
         
     | 
| 
      
 35 
     | 
    
         
            +
                  # new genbank holder to be annotated
         
     | 
| 
      
 36 
     | 
    
         
            +
                  new_gbk file_ref
         
     | 
| 
      
 37 
     | 
    
         
            +
                end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
              end
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
              # Use a MERGEM database to get annotation from it
         
     | 
| 
      
 43 
     | 
    
         
            +
              def reference_db dir
         
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
                abort "Aborting: Can't find MERGEM db direcotry" if ! File.exists? dir
         
     | 
| 
      
 46 
     | 
    
         
            +
             
     | 
| 
      
 47 
     | 
    
         
            +
                @cds_file = "#{dir}/cds.dmnd"
         
     | 
| 
      
 48 
     | 
    
         
            +
                @rna_file = "#{dir}/rnas.fasta"
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
                json_genes = {}
         
     | 
| 
      
 51 
     | 
    
         
            +
                Zlib::GzipReader.open("#{dir}/cds.json.gz") {|gz|
         
     | 
| 
      
 52 
     | 
    
         
            +
                  json_genes = JSON.parse(gz.read)
         
     | 
| 
      
 53 
     | 
    
         
            +
                }
         
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
      
 55 
     | 
    
         
            +
                json_genes.each do |gene|
         
     | 
| 
       16 
56 
     | 
    
         | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
      
 57 
     | 
    
         
            +
                  prot_id = gene["cluster_id"]
         
     | 
| 
      
 58 
     | 
    
         
            +
                  @coding_seq[prot_id] = {
         
     | 
| 
      
 59 
     | 
    
         
            +
                    protId: prot_id,
         
     | 
| 
      
 60 
     | 
    
         
            +
                    location: nil,
         
     | 
| 
      
 61 
     | 
    
         
            +
                    product: gene["consensus_name"],
         
     | 
| 
      
 62 
     | 
    
         
            +
                    length: gene["consensus_length"]
         
     | 
| 
      
 63 
     | 
    
         
            +
                  }
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
      
 65 
     | 
    
         
            +
                end
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
                # File.open("#{dir}/cds.txt") do |f|
         
     | 
| 
      
 68 
     | 
    
         
            +
                #   while l = f.gets
         
     | 
| 
      
 69 
     | 
    
         
            +
                #     lA = l.chomp.split(" ")
         
     | 
| 
      
 70 
     | 
    
         
            +
                #     @coding_seq[lA[0].gsub(">","")] = {
         
     | 
| 
      
 71 
     | 
    
         
            +
                #       protId: lA[0].gsub(">",""),
         
     | 
| 
      
 72 
     | 
    
         
            +
                #       location: nil,
         
     | 
| 
      
 73 
     | 
    
         
            +
                #       product: lA[1..-1].join(' '),
         
     | 
| 
      
 74 
     | 
    
         
            +
                #     }
         
     | 
| 
      
 75 
     | 
    
         
            +
                #   end
         
     | 
| 
      
 76 
     | 
    
         
            +
                # end
         
     | 
| 
      
 77 
     | 
    
         
            +
             
     | 
| 
      
 78 
     | 
    
         
            +
                File.open("#{dir}/rnas.txt") do |f|
         
     | 
| 
      
 79 
     | 
    
         
            +
                  while l = f.gets
         
     | 
| 
      
 80 
     | 
    
         
            +
                    lA = l.chomp.split(" ")
         
     | 
| 
      
 81 
     | 
    
         
            +
                    @rna_seq[lA[0].gsub(">","")] = {
         
     | 
| 
      
 82 
     | 
    
         
            +
                      protId: lA[0].gsub(">",""),
         
     | 
| 
      
 83 
     | 
    
         
            +
                      location: nil,
         
     | 
| 
      
 84 
     | 
    
         
            +
                      product: lA[1..-1].join(' '),
         
     | 
| 
      
 85 
     | 
    
         
            +
                    }
         
     | 
| 
      
 86 
     | 
    
         
            +
                  end
         
     | 
| 
       22 
87 
     | 
    
         
             
                end
         
     | 
| 
       23 
88 
     | 
    
         | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
      
 89 
     | 
    
         
            +
              end
         
     | 
| 
      
 90 
     | 
    
         
            +
             
     | 
| 
      
 91 
     | 
    
         
            +
              # Use a Genbank Reference and read annotation from it
         
     | 
| 
      
 92 
     | 
    
         
            +
              def reference_gbk gbk_file
         
     | 
| 
      
 93 
     | 
    
         
            +
             
     | 
| 
      
 94 
     | 
    
         
            +
                puts "# Preparing reference genome files.."
         
     | 
| 
      
 95 
     | 
    
         
            +
                if ! File.exists? gbk_file
         
     | 
| 
      
 96 
     | 
    
         
            +
                  fetch_ncbi_genome(gbk_file)
         
     | 
| 
      
 97 
     | 
    
         
            +
                  gbk_file = "#{@outdir}/#{gbk_file}.gbk"
         
     | 
| 
      
 98 
     | 
    
         
            +
                  # gbk_file += ".gbk"
         
     | 
| 
      
 99 
     | 
    
         
            +
                end
         
     | 
| 
      
 100 
     | 
    
         
            +
             
     | 
| 
      
 101 
     | 
    
         
            +
                flat_gbk = Bio::FlatFile.auto(gbk_file)
         
     | 
| 
       25 
102 
     | 
    
         | 
| 
       26 
103 
     | 
    
         
             
                # Check if gbk is valid
         
     | 
| 
       27 
104 
     | 
    
         
             
                if flat_gbk.dbclass != Bio::GenBank
         
     | 
| 
       28 
     | 
    
         
            -
                  abort "Aborting : The input #{ 
     | 
| 
      
 105 
     | 
    
         
            +
                  abort "Aborting : The input #{gbk_file} is not a valid genbank file !"
         
     | 
| 
      
 106 
     | 
    
         
            +
                else
         
     | 
| 
      
 107 
     | 
    
         
            +
                  @gbk = flat_gbk.next_entry
         
     | 
| 
      
 108 
     | 
    
         
            +
                end
         
     | 
| 
      
 109 
     | 
    
         
            +
             
     | 
| 
      
 110 
     | 
    
         
            +
                @bioseq = @gbk.to_biosequence
         
     | 
| 
      
 111 
     | 
    
         
            +
             
     | 
| 
      
 112 
     | 
    
         
            +
                write_cds_to_file
         
     | 
| 
      
 113 
     | 
    
         
            +
                write_rna_to_file
         
     | 
| 
      
 114 
     | 
    
         
            +
             
     | 
| 
      
 115 
     | 
    
         
            +
              end
         
     | 
| 
      
 116 
     | 
    
         
            +
             
     | 
| 
      
 117 
     | 
    
         
            +
              # Use a single fasta file as the annotation database and read annotation from its headers
         
     | 
| 
      
 118 
     | 
    
         
            +
              def single_fasta fasta_file
         
     | 
| 
      
 119 
     | 
    
         
            +
             
     | 
| 
      
 120 
     | 
    
         
            +
                return "" if ! File.exists? fasta_file
         
     | 
| 
      
 121 
     | 
    
         
            +
             
     | 
| 
      
 122 
     | 
    
         
            +
                File.open(fasta_file, "r") do |dbfile|
         
     | 
| 
      
 123 
     | 
    
         
            +
             
     | 
| 
      
 124 
     | 
    
         
            +
                  while l=dbfile.gets
         
     | 
| 
      
 125 
     | 
    
         
            +
             
     | 
| 
      
 126 
     | 
    
         
            +
                    if l[0] == ">"
         
     | 
| 
      
 127 
     | 
    
         
            +
             
     | 
| 
      
 128 
     | 
    
         
            +
                      lA = l.chomp.split("|")
         
     | 
| 
      
 129 
     | 
    
         
            +
             
     | 
| 
      
 130 
     | 
    
         
            +
                      if lA.length > 1      # refseq, ncbi, trembl, swissprot
         
     | 
| 
      
 131 
     | 
    
         
            +
             
     | 
| 
      
 132 
     | 
    
         
            +
                        key_gi = l.split(" ")[0][1..-1]
         
     | 
| 
      
 133 
     | 
    
         
            +
                        product_long = lA[-1]
         
     | 
| 
      
 134 
     | 
    
         
            +
             
     | 
| 
      
 135 
     | 
    
         
            +
                        organism = ""
         
     | 
| 
      
 136 
     | 
    
         
            +
                        product = ""
         
     | 
| 
      
 137 
     | 
    
         
            +
                        db_source = "[DBSource]"
         
     | 
| 
      
 138 
     | 
    
         
            +
             
     | 
| 
      
 139 
     | 
    
         
            +
                        if product_long.scan(/|/).count >= 5 # FROM BIORUBY SCRIPTS
         
     | 
| 
      
 140 
     | 
    
         
            +
                          product = product_long
         
     | 
| 
      
 141 
     | 
    
         
            +
                          db_source = "RefSeq"
         
     | 
| 
      
 142 
     | 
    
         
            +
                        elsif product_long.include? " [" and product_long.include? "]" # NCBI
         
     | 
| 
      
 143 
     | 
    
         
            +
                          organism = product_long[/\[.*?\]/]
         
     | 
| 
      
 144 
     | 
    
         
            +
                          product = product_long.split(" [")[0].strip
         
     | 
| 
      
 145 
     | 
    
         
            +
                        elsif product_long.include? "OS=" # Swissprot / TrEMBL
         
     | 
| 
      
 146 
     | 
    
         
            +
                          product_tmp = product.split("OS=")
         
     | 
| 
      
 147 
     | 
    
         
            +
                          organism = product_tmp[1].split(/[A-Z][A-Z]=/)[0].strip
         
     | 
| 
      
 148 
     | 
    
         
            +
                          product = product_tmp[0].strip
         
     | 
| 
      
 149 
     | 
    
         
            +
                        elsif product_long.include? "[A-Z][A-Z]=" # NCBI
         
     | 
| 
      
 150 
     | 
    
         
            +
                          product = product_long.split(/[A-Z][A-Z]=/)[0].strip
         
     | 
| 
      
 151 
     | 
    
         
            +
                        else
         
     | 
| 
      
 152 
     | 
    
         
            +
                          product = product_long
         
     | 
| 
      
 153 
     | 
    
         
            +
                        end
         
     | 
| 
      
 154 
     | 
    
         
            +
             
     | 
| 
      
 155 
     | 
    
         
            +
                        org = organism.gsub("[","").gsub("]","")
         
     | 
| 
      
 156 
     | 
    
         
            +
             
     | 
| 
      
 157 
     | 
    
         
            +
                        product.lstrip!
         
     | 
| 
      
 158 
     | 
    
         
            +
                        prot_id = nil
         
     | 
| 
      
 159 
     | 
    
         
            +
             
     | 
| 
      
 160 
     | 
    
         
            +
                        if key_gi.count("|") == 4
         
     | 
| 
      
 161 
     | 
    
         
            +
                          if lA[2] == "ref"
         
     | 
| 
      
 162 
     | 
    
         
            +
                            db_source = "RefSeq"
         
     | 
| 
      
 163 
     | 
    
         
            +
                          end
         
     | 
| 
      
 164 
     | 
    
         
            +
                          prot_id = lA[3]
         
     | 
| 
      
 165 
     | 
    
         
            +
                        elsif key_gi.count("|") == 2
         
     | 
| 
      
 166 
     | 
    
         
            +
                          if lA[0].include? == "sp" or
         
     | 
| 
      
 167 
     | 
    
         
            +
                            lA[0].include? == "tr"
         
     | 
| 
      
 168 
     | 
    
         
            +
                            db_source = "UniProtKB"
         
     | 
| 
      
 169 
     | 
    
         
            +
                          end
         
     | 
| 
      
 170 
     | 
    
         
            +
                          prot_id = lA[1]
         
     | 
| 
      
 171 
     | 
    
         
            +
                        elsif key_gi.count("|") == 5
         
     | 
| 
      
 172 
     | 
    
         
            +
                          db_source = "RefSeq"
         
     | 
| 
      
 173 
     | 
    
         
            +
                          prot_id = lA[2]
         
     | 
| 
      
 174 
     | 
    
         
            +
                        end
         
     | 
| 
      
 175 
     | 
    
         
            +
             
     | 
| 
      
 176 
     | 
    
         
            +
             
     | 
| 
      
 177 
     | 
    
         
            +
                      else                  # mergem
         
     | 
| 
      
 178 
     | 
    
         
            +
             
     | 
| 
      
 179 
     | 
    
         
            +
             
     | 
| 
      
 180 
     | 
    
         
            +
                      end
         
     | 
| 
      
 181 
     | 
    
         
            +
             
     | 
| 
      
 182 
     | 
    
         
            +
                      @coding_seq[key_gi] = { product: product,
         
     | 
| 
      
 183 
     | 
    
         
            +
                                              org: org,
         
     | 
| 
      
 184 
     | 
    
         
            +
                                              prot_id: prot_id,
         
     | 
| 
      
 185 
     | 
    
         
            +
                                              db_source: db_source }
         
     | 
| 
      
 186 
     | 
    
         
            +
             
     | 
| 
      
 187 
     | 
    
         
            +
                    end
         
     | 
| 
      
 188 
     | 
    
         
            +
             
     | 
| 
      
 189 
     | 
    
         
            +
                  end
         
     | 
| 
      
 190 
     | 
    
         
            +
             
     | 
| 
      
 191 
     | 
    
         
            +
                end
         
     | 
| 
      
 192 
     | 
    
         
            +
             
     | 
| 
      
 193 
     | 
    
         
            +
              end
         
     | 
| 
      
 194 
     | 
    
         
            +
             
     | 
| 
      
 195 
     | 
    
         
            +
             
     | 
| 
      
 196 
     | 
    
         
            +
              # New Genbank Holder to add annotation to it
         
     | 
| 
      
 197 
     | 
    
         
            +
              def new_gbk gbk_file
         
     | 
| 
      
 198 
     | 
    
         
            +
             
     | 
| 
      
 199 
     | 
    
         
            +
                if ! File.exists? gbk_file
         
     | 
| 
      
 200 
     | 
    
         
            +
                  fetch_ncbi_genome(gbk_file)
         
     | 
| 
      
 201 
     | 
    
         
            +
                  gbk_file = "#{@outdir}/#{gbk_file}.gbk"
         
     | 
| 
      
 202 
     | 
    
         
            +
                  # gbk_file += ".gbk"
         
     | 
| 
      
 203 
     | 
    
         
            +
                end
         
     | 
| 
      
 204 
     | 
    
         
            +
             
     | 
| 
      
 205 
     | 
    
         
            +
                flat_gbk = Bio::FlatFile.auto(gbk_file)
         
     | 
| 
      
 206 
     | 
    
         
            +
             
     | 
| 
      
 207 
     | 
    
         
            +
                # Check if gbk is valid
         
     | 
| 
      
 208 
     | 
    
         
            +
                if flat_gbk.dbclass != Bio::GenBank
         
     | 
| 
      
 209 
     | 
    
         
            +
                  abort "Aborting : The input #{gbk_file} is not a valid genbank file !"
         
     | 
| 
       29 
210 
     | 
    
         
             
                else
         
     | 
| 
       30 
211 
     | 
    
         
             
                  @gbk = flat_gbk.next_entry
         
     | 
| 
       31 
212 
     | 
    
         
             
                end
         
     | 
| 
         @@ -38,9 +219,7 @@ class SequenceAnnotation 
     | 
|
| 
       38 
219 
     | 
    
         
             
              # Prepare CDS/proteins
         
     | 
| 
       39 
220 
     | 
    
         
             
              def get_cds
         
     | 
| 
       40 
221 
     | 
    
         | 
| 
       41 
     | 
    
         
            -
                if @coding_seq 
     | 
| 
       42 
     | 
    
         
            -
             
     | 
| 
       43 
     | 
    
         
            -
                  @coding_seq = {}
         
     | 
| 
      
 222 
     | 
    
         
            +
                if @coding_seq.empty?
         
     | 
| 
       44 
223 
     | 
    
         | 
| 
       45 
224 
     | 
    
         
             
                  # Iterate over each CDS
         
     | 
| 
       46 
225 
     | 
    
         
             
                  @gbk.each_cds do |ft|
         
     | 
| 
         @@ -74,7 +253,7 @@ class SequenceAnnotation 
     | 
|
| 
       74 
253 
     | 
    
         
             
                      product: product[0],
         
     | 
| 
       75 
254 
     | 
    
         
             
                      bioseq: pepBioSeq,
         
     | 
| 
       76 
255 
     | 
    
         
             
                      bioseq_gene: dnaBioSeq,
         
     | 
| 
       77 
     | 
    
         
            -
                       
     | 
| 
      
 256 
     | 
    
         
            +
                      length: pepBioSeq.length
         
     | 
| 
       78 
257 
     | 
    
         
             
                    }
         
     | 
| 
       79 
258 
     | 
    
         | 
| 
       80 
259 
     | 
    
         
             
                  end
         
     | 
| 
         @@ -88,12 +267,12 @@ class SequenceAnnotation 
     | 
|
| 
       88 
267 
     | 
    
         
             
              # Prepare rRNA tRNA
         
     | 
| 
       89 
268 
     | 
    
         
             
              def get_rna
         
     | 
| 
       90 
269 
     | 
    
         | 
| 
       91 
     | 
    
         
            -
                if @rna_seq 
     | 
| 
      
 270 
     | 
    
         
            +
                if @rna_seq.empty?
         
     | 
| 
       92 
271 
     | 
    
         | 
| 
       93 
272 
     | 
    
         
             
                  @rna_seq = {}
         
     | 
| 
       94 
273 
     | 
    
         
             
                  @gbk.features do |ft|
         
     | 
| 
       95 
274 
     | 
    
         | 
| 
       96 
     | 
    
         
            -
                    next if ! ft.feature.to_s.include? " 
     | 
| 
      
 275 
     | 
    
         
            +
                    next if ! ft.feature.to_s.include? "rRNA"
         
     | 
| 
       97 
276 
     | 
    
         | 
| 
       98 
277 
     | 
    
         
             
                    ftH = ft.to_hash
         
     | 
| 
       99 
278 
     | 
    
         
             
                    loc = ft.locations
         
     | 
| 
         @@ -129,20 +308,19 @@ class SequenceAnnotation 
     | 
|
| 
       129 
308 
     | 
    
         | 
| 
       130 
309 
     | 
    
         
             
              end
         
     | 
| 
       131 
310 
     | 
    
         | 
| 
       132 
     | 
    
         
            -
             
     | 
| 
       133 
311 
     | 
    
         
             
              # Print CDS to files
         
     | 
| 
       134 
312 
     | 
    
         
             
              # RETURN : cds_file path
         
     | 
| 
       135 
     | 
    
         
            -
              def write_cds_to_file 
     | 
| 
      
 313 
     | 
    
         
            +
              def write_cds_to_file
         
     | 
| 
       136 
314 
     | 
    
         | 
| 
       137 
315 
     | 
    
         
             
                cds_file = "#{@gbk.accession}.pep"
         
     | 
| 
       138 
316 
     | 
    
         
             
                dna_file = "#{@gbk.accession}.dna"
         
     | 
| 
       139 
317 
     | 
    
         | 
| 
       140 
     | 
    
         
            -
                if @coding_seq 
     | 
| 
      
 318 
     | 
    
         
            +
                if @coding_seq.empty?
         
     | 
| 
       141 
319 
     | 
    
         
             
                  get_cds
         
     | 
| 
       142 
320 
     | 
    
         
             
                end
         
     | 
| 
       143 
321 
     | 
    
         | 
| 
       144 
     | 
    
         
            -
                dna_out = File.open("#{outdir}/#{dna_file}", "w")
         
     | 
| 
       145 
     | 
    
         
            -
                File.open("#{outdir}/#{cds_file}", "w") do |fwrite|
         
     | 
| 
      
 322 
     | 
    
         
            +
                dna_out = File.open("#{@outdir}/#{dna_file}", "w")
         
     | 
| 
      
 323 
     | 
    
         
            +
                File.open("#{@outdir}/#{cds_file}", "w") do |fwrite|
         
     | 
| 
       146 
324 
     | 
    
         
             
                  @coding_seq.each_key do |k|
         
     | 
| 
       147 
325 
     | 
    
         
             
                    seqout = @coding_seq[k][:bioseq].output_fasta("#{k}",60)
         
     | 
| 
       148 
326 
     | 
    
         
             
                    seqout_dna = @coding_seq[k][:bioseq_gene].output_fasta("#{k}",60)
         
     | 
| 
         @@ -152,28 +330,28 @@ class SequenceAnnotation 
     | 
|
| 
       152 
330 
     | 
    
         
             
                end
         
     | 
| 
       153 
331 
     | 
    
         
             
                dna_out.close
         
     | 
| 
       154 
332 
     | 
    
         | 
| 
       155 
     | 
    
         
            -
                @cds_file = "#{outdir}/" + cds_file
         
     | 
| 
      
 333 
     | 
    
         
            +
                @cds_file = "#{@outdir}/" + cds_file
         
     | 
| 
       156 
334 
     | 
    
         | 
| 
       157 
335 
     | 
    
         
             
              end
         
     | 
| 
       158 
336 
     | 
    
         | 
| 
       159 
337 
     | 
    
         
             
              # Print RNA to files
         
     | 
| 
       160 
338 
     | 
    
         
             
              # RETURN : rna_file path
         
     | 
| 
       161 
     | 
    
         
            -
              def write_rna_to_file 
     | 
| 
      
 339 
     | 
    
         
            +
              def write_rna_to_file
         
     | 
| 
       162 
340 
     | 
    
         | 
| 
       163 
341 
     | 
    
         
             
                rna_file = "#{@gbk.accession}.rna"
         
     | 
| 
       164 
342 
     | 
    
         | 
| 
       165 
     | 
    
         
            -
                if @rna_seq 
     | 
| 
      
 343 
     | 
    
         
            +
                if @rna_seq.empty?
         
     | 
| 
       166 
344 
     | 
    
         
             
                  get_rna
         
     | 
| 
       167 
345 
     | 
    
         
             
                end
         
     | 
| 
       168 
346 
     | 
    
         | 
| 
       169 
     | 
    
         
            -
                File.open("#{outdir}/#{rna_file}", "w") do |fwrite|
         
     | 
| 
      
 347 
     | 
    
         
            +
                File.open("#{@outdir}/#{rna_file}", "w") do |fwrite|
         
     | 
| 
       170 
348 
     | 
    
         
             
                  @rna_seq.each_key do |k|
         
     | 
| 
       171 
349 
     | 
    
         
             
                    seqout_dna = @rna_seq[k][:bioseq_gene].output_fasta("#{k}|#{@rna_seq[k][:type]}|#{@rna_seq[k][:product]}",60)
         
     | 
| 
       172 
350 
     | 
    
         
             
                    fwrite.write(seqout_dna)
         
     | 
| 
       173 
351 
     | 
    
         
             
                  end
         
     | 
| 
       174 
352 
     | 
    
         
             
                end
         
     | 
| 
       175 
353 
     | 
    
         | 
| 
       176 
     | 
    
         
            -
                @rna_file = "#{outdir}/" + rna_file
         
     | 
| 
      
 354 
     | 
    
         
            +
                @rna_file = "#{@outdir}/" + rna_file
         
     | 
| 
       177 
355 
     | 
    
         | 
| 
       178 
356 
     | 
    
         
             
              end
         
     | 
| 
       179 
357 
     | 
    
         | 
| 
         @@ -247,6 +425,7 @@ class SequenceAnnotation 
     | 
|
| 
       247 
425 
     | 
    
         | 
| 
       248 
426 
     | 
    
         
             
                      # check if there is a reference genome.. reference_locus shouldn't be nil in that case
         
     | 
| 
       249 
427 
     | 
    
         
             
                      if locus != nil
         
     | 
| 
      
 428 
     | 
    
         
            +
             
     | 
| 
       250 
429 
     | 
    
         
             
                        qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (AA identity: #{pId}%; coverage(q,s): #{cov_query}%,#{cov_subject}%) from #{ref_genome}")
         
     | 
| 
       251 
430 
     | 
    
         
             
                        ftArray.push(qNote)
         
     | 
| 
       252 
431 
     | 
    
         | 
| 
         @@ -390,9 +569,9 @@ class SequenceAnnotation 
     | 
|
| 
       390 
569 
     | 
    
         
             
              end
         
     | 
| 
       391 
570 
     | 
    
         | 
| 
       392 
571 
     | 
    
         | 
| 
       393 
     | 
    
         
            -
              def save_genbank_to_file 
     | 
| 
      
 572 
     | 
    
         
            +
              def save_genbank_to_file
         
     | 
| 
       394 
573 
     | 
    
         | 
| 
       395 
     | 
    
         
            -
                File.open("#{outdir}/#{@gbk.definition}.gbk", "w") do |f|
         
     | 
| 
      
 574 
     | 
    
         
            +
                File.open("#{@outdir}/#{@gbk.definition}.gbk", "w") do |f|
         
     | 
| 
       396 
575 
     | 
    
         
             
                  f.write(@gbk.to_biosequence.output(:genbank))
         
     | 
| 
       397 
576 
     | 
    
         
             
                end
         
     | 
| 
       398 
577 
     | 
    
         | 
| 
         @@ -403,7 +582,7 @@ class SequenceAnnotation 
     | 
|
| 
       403 
582 
     | 
    
         
             
              ###################
         
     | 
| 
       404 
583 
     | 
    
         | 
| 
       405 
584 
     | 
    
         
             
              # Fct: Get dna sequence
         
     | 
| 
       406 
     | 
    
         
            -
              def get_DNA  
     | 
| 
      
 585 
     | 
    
         
            +
              def get_DNA cds, seq
         
     | 
| 
       407 
586 
     | 
    
         
             
                loc = cds.locations
         
     | 
| 
       408 
587 
     | 
    
         
             
                sbeg = loc[0].from.to_i
         
     | 
| 
       409 
588 
     | 
    
         
             
                send = loc[0].to.to_i
         
     | 
| 
         @@ -418,11 +597,11 @@ class SequenceAnnotation 
     | 
|
| 
       418 
597 
     | 
    
         | 
| 
       419 
598 
     | 
    
         | 
| 
       420 
599 
     | 
    
         
             
              # Fetch genbank genome from NCBI
         
     | 
| 
       421 
     | 
    
         
            -
              def fetch_ncbi_genome refgenome_id 
     | 
| 
      
 600 
     | 
    
         
            +
              def fetch_ncbi_genome refgenome_id
         
     | 
| 
       422 
601 
     | 
    
         
             
                Bio::NCBI.default_email = 'default@default.com'
         
     | 
| 
       423 
602 
     | 
    
         
             
                ncbi = Bio::NCBI::REST.new
         
     | 
| 
       424 
603 
     | 
    
         
             
                genbankstring = ncbi.efetch(refgenome_id, {"db"=>'nucleotide', "rettype"=>'gb'})
         
     | 
| 
       425 
     | 
    
         
            -
                File.open("#{outdir}/#{refgenome_id}.gbk", "w") do |f|
         
     | 
| 
      
 604 
     | 
    
         
            +
                File.open("#{@outdir}/#{refgenome_id}.gbk", "w") do |f|
         
     | 
| 
       426 
605 
     | 
    
         
             
                  f.write(genbankstring)
         
     | 
| 
       427 
606 
     | 
    
         
             
                end
         
     | 
| 
       428 
607 
     | 
    
         
             
              end
         
     | 
| 
         @@ -13,8 +13,10 @@ class SequenceFasta 
     | 
|
| 
       13 
13 
     | 
    
         
             
              attr_reader :fasta_flat, :fasta_file, :annotation_files
         
     | 
| 
       14 
14 
     | 
    
         | 
| 
       15 
15 
     | 
    
         
             
              # Initialize fasta holder
         
     | 
| 
       16 
     | 
    
         
            -
              def initialize fasta_file, meta
         
     | 
| 
      
 16 
     | 
    
         
            +
              def initialize root, outdir, fasta_file, meta
         
     | 
| 
       17 
17 
     | 
    
         | 
| 
      
 18 
     | 
    
         
            +
                @root = root
         
     | 
| 
      
 19 
     | 
    
         
            +
                @outdir = outdir
         
     | 
| 
       18 
20 
     | 
    
         
             
                @fasta_file = fasta_file
         
     | 
| 
       19 
21 
     | 
    
         
             
                @fasta_flat = Bio::FlatFile.auto(@fasta_file)
         
     | 
| 
       20 
22 
     | 
    
         | 
| 
         @@ -32,29 +34,29 @@ class SequenceFasta 
     | 
|
| 
       32 
34 
     | 
    
         | 
| 
       33 
35 
     | 
    
         | 
| 
       34 
36 
     | 
    
         
             
              # Run prodigal on the genome to annotate
         
     | 
| 
       35 
     | 
    
         
            -
              def run_prodigal 
     | 
| 
      
 37 
     | 
    
         
            +
              def run_prodigal
         
     | 
| 
       36 
38 
     | 
    
         | 
| 
       37 
39 
     | 
    
         
             
                @annotation_files = {}
         
     | 
| 
       38 
     | 
    
         
            -
                Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
         
     | 
| 
      
 40 
     | 
    
         
            +
                Dir.mkdir "#{@outdir}" if ! Dir.exists? "#{@outdir}"
         
     | 
| 
       39 
41 
     | 
    
         
             
                if @meta==1
         
     | 
| 
       40 
     | 
    
         
            -
                  system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
         
     | 
| 
      
 42 
     | 
    
         
            +
                  system("#{@root}/prodigal.linux -p meta -i #{@fasta_file} -a #{@outdir}/Proteins.fa -d #{@outdir}/Genes.fa -o #{@outdir}/Genbanks.gbk -q")
         
     | 
| 
       41 
43 
     | 
    
         
             
                else
         
     | 
| 
       42 
     | 
    
         
            -
                  system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
         
     | 
| 
      
 44 
     | 
    
         
            +
                  system("#{@root}/prodigal.linux -i #{@fasta_file} -a #{@outdir}/Proteins.fa -d #{@outdir}/Genes.fa -o #{@outdir}/Genbanks.gbk -q")
         
     | 
| 
       43 
45 
     | 
    
         
             
                end
         
     | 
| 
       44 
46 
     | 
    
         | 
| 
       45 
47 
     | 
    
         
             
                @annotation_files = {
         
     | 
| 
       46 
     | 
    
         
            -
                  multiGBK: "#{outdir}/Genbanks.gbk",
         
     | 
| 
      
 48 
     | 
    
         
            +
                  multiGBK: "#{@outdir}/Genbanks.gbk",
         
     | 
| 
       47 
49 
     | 
    
         
             
                  contigs: [],
         
     | 
| 
       48 
50 
     | 
    
         
             
                  contigs_length: [],
         
     | 
| 
       49 
     | 
    
         
            -
                  genes: "#{outdir}/Genes.fa",
         
     | 
| 
       50 
     | 
    
         
            -
                  proteins: "#{outdir}/Proteins.fa",
         
     | 
| 
      
 51 
     | 
    
         
            +
                  genes: "#{@outdir}/Genes.fa",
         
     | 
| 
      
 52 
     | 
    
         
            +
                  proteins: "#{@outdir}/Proteins.fa",
         
     | 
| 
       51 
53 
     | 
    
         
             
                  prot_ids_by_contig: {},
         
     | 
| 
       52 
     | 
    
         
            -
                  fasta_path: "#{outdir}/single-fasta/",
         
     | 
| 
       53 
     | 
    
         
            -
                  gbk_path: "#{outdir}/single-genbank/"
         
     | 
| 
      
 54 
     | 
    
         
            +
                  fasta_path: "#{@outdir}/single-fasta/",
         
     | 
| 
      
 55 
     | 
    
         
            +
                  gbk_path: "#{@outdir}/single-genbank/"
         
     | 
| 
       54 
56 
     | 
    
         
             
                }
         
     | 
| 
       55 
57 
     | 
    
         | 
| 
       56 
     | 
    
         
            -
                split_fasta 
     | 
| 
       57 
     | 
    
         
            -
                split_genbank 
     | 
| 
      
 58 
     | 
    
         
            +
                split_fasta
         
     | 
| 
      
 59 
     | 
    
         
            +
                split_genbank
         
     | 
| 
       58 
60 
     | 
    
         
             
                extract_cds_names
         
     | 
| 
       59 
61 
     | 
    
         
             
                @annotation_files
         
     | 
| 
       60 
62 
     | 
    
         | 
| 
         @@ -63,14 +65,14 @@ class SequenceFasta 
     | 
|
| 
       63 
65 
     | 
    
         | 
| 
       64 
66 
     | 
    
         
             
              # Split Multi Fasta file
         
     | 
| 
       65 
67 
     | 
    
         
             
              # RETURN : array of fasta files
         
     | 
| 
       66 
     | 
    
         
            -
              def split_fasta 
     | 
| 
      
 68 
     | 
    
         
            +
              def split_fasta
         
     | 
| 
       67 
69 
     | 
    
         
             
                @single_fasta = {}
         
     | 
| 
       68 
     | 
    
         
            -
                Dir.mkdir("#{outdir}/single-fasta") if ! Dir.exists?("#{outdir}/single-fasta")
         
     | 
| 
      
 70 
     | 
    
         
            +
                Dir.mkdir("#{@outdir}/single-fasta") if ! Dir.exists?("#{@outdir}/single-fasta")
         
     | 
| 
       69 
71 
     | 
    
         
             
                @fasta_flat.each_entry do |seq|
         
     | 
| 
       70 
72 
     | 
    
         
             
                  file_name = seq.definition.chomp.split(" ")[0]
         
     | 
| 
       71 
73 
     | 
    
         
             
                  @annotation_files[:contigs] << "#{file_name}"
         
     | 
| 
       72 
74 
     | 
    
         
             
                  @annotation_files[:contigs_length] << seq.seq.length
         
     | 
| 
       73 
     | 
    
         
            -
                  File.open("#{outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
         
     | 
| 
      
 75 
     | 
    
         
            +
                  File.open("#{@outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
         
     | 
| 
       74 
76 
     | 
    
         
             
                    fwrite.write(seq)
         
     | 
| 
       75 
77 
     | 
    
         
             
                  end
         
     | 
| 
       76 
78 
     | 
    
         
             
                  @single_fasta[file_name] = seq
         
     | 
| 
         @@ -80,9 +82,10 @@ class SequenceFasta 
     | 
|
| 
       80 
82 
     | 
    
         | 
| 
       81 
83 
     | 
    
         
             
              # Split Multi Genbanks file
         
     | 
| 
       82 
84 
     | 
    
         
             
              # RETURN : array of genbank files
         
     | 
| 
       83 
     | 
    
         
            -
              def split_genbank 
     | 
| 
      
 85 
     | 
    
         
            +
              def split_genbank
         
     | 
| 
       84 
86 
     | 
    
         | 
| 
       85 
     | 
    
         
            -
                 
     | 
| 
      
 87 
     | 
    
         
            +
                multigbk = "#{@outdir}/Genbanks.gbk"
         
     | 
| 
      
 88 
     | 
    
         
            +
                Dir.mkdir("#{@outdir}/single-genbank")if ! Dir.exists?("#{@outdir}/single-genbank")
         
     | 
| 
       86 
89 
     | 
    
         
             
                File.open(multigbk,"r") do |f|
         
     | 
| 
       87 
90 
     | 
    
         
             
                  fopen = nil
         
     | 
| 
       88 
91 
     | 
    
         
             
                  while l = f.gets
         
     | 
| 
         @@ -96,7 +99,7 @@ class SequenceFasta 
     | 
|
| 
       96 
99 
     | 
    
         
             
                      year = date.year
         
     | 
| 
       97 
100 
     | 
    
         
             
                      locus = "LOCUS       #{file_name}#{spacer}#{seq_length.to_s} bp    DNA     linear   BCT #{day}-#{month}-#{year}\n"
         
     | 
| 
       98 
101 
     | 
    
         
             
                      locus += "DEFINITION  #{file_name}\n"
         
     | 
| 
       99 
     | 
    
         
            -
                      fopen = File.open("#{outdir}/single-genbank/#{file_name}.gbk", "w")
         
     | 
| 
      
 102 
     | 
    
         
            +
                      fopen = File.open("#{@outdir}/single-genbank/#{file_name}.gbk", "w")
         
     | 
| 
       100 
103 
     | 
    
         
             
                      fopen.write(locus)
         
     | 
| 
       101 
104 
     | 
    
         
             
                    elsif l[0..1] == "//"
         
     | 
| 
       102 
105 
     | 
    
         
             
                      fopen.write(outseq)
         
     |