RubyGems - bacterial-annotator - Versions diffs - 0.7.0 → 0.7.1 - Mend

bacterial-annotator 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/bin/bacterial-annotator +39 -29
data/lib/bacterial-annotator/sequence-annotation.rb +209 -30
data/lib/bacterial-annotator/sequence-fasta.rb +21 -18
data/lib/bacterial-annotator/sequence-synteny.rb +77 -20
data/lib/bacterial-annotator.rb +201 -64
data/lib/bacterial-comparator.rb +42 -26
data/lib/bacterial-identificator.rb +86 -13
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 0228aafd97af13b8756df42db362e4a53a30f4f0
-  data.tar.gz: 858942624597354dd0f52ad98ea1e94373289174
+  metadata.gz: 10f3d2469fb3aaf64e6b84076e05ab9e1ae41cd6
+  data.tar.gz: f08a5465ce584dd888074c7d0146c1450386598e
 SHA512:
-  metadata.gz: 82c36c4fba00b437e721991c739517b7cfeb5edaa7e1ac49849e59d3ffac2165f1ef39f9961aa756ff8ad691fec36a8b3424cf8ce4d0e1125d486fa2e2a38593
-  data.tar.gz: e8b569f61f2dcb7309c6587ce619f7432e2588a717f3017053adcb693327ac2f21850785883eae4477226d057dde18a308a88d85d2a0558561d567865d1348cc
+  metadata.gz: bd006cf021f0a74f1e98fa6367ca4aca0abb36004f375654ec552b68e1ac8ebc5c1f65e38a480473551848d25ac6be904c5d1841cc60657a47384169d368a18c
+  data.tar.gz: b5a8cb5c74c028e813bbc585e70b6dcb420b8c8f4ad659e8e4c985bce868009a7f5d6015c4e396768a6146c918943e4586a638c084d844cc91be6ac927c993b6

data/bin/bacterial-annotator CHANGED Viewed

@@ -63,27 +63,28 @@ def usage_annotate
 annotate [OPTIONS]
   // IO
-    --input/-i		<fasta_file>	Provide the fasta file to annotate
-    --outdir/-o		<outdir>	Output directory [default=BAnnotation]
-    --force/-f		Force to overwrite the output directory
-    --name/-n		<name> Sample name
+    --input/-i      <fasta_file>         Provide the fasta file to annotate
+    --outdir/-o     <outdir>             Output directory [default=BAnnotation]
+    --name/-n       <name>               Sample name
+    --force/-f                           Force to overwrite the output directory
   // MERGEM-based Annotation (Recommended)
-    --db/-d		<directory> MERGEM database directory
+    --db/-d         <species_dir>        From MERGEM database (include CDS and RNAs fasta)
+                                         // see bacteriapps.genome.ulaval.ca/mergem
   // Reference-Based Annotation
-    --refgenome/-g	<GBK_ID> 	Provide a Genbank file or a Gbk Accession ID.
-    --externaldb	<proteins fasta_file>
-			  Finish or do a complete annotation with this sequence database (a protein fasta file).
-			  Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
-			  >gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
-			  >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
-    --pidentity		<% identity> Minimum percentage identity to incorporate a CDS annotation [default=0.8]
-    --pcoverage		<% identity> Minimum percentage of coverage over protein alignment to incorporate a CDS annotation [default=0.8]
-			             .. otherwise hint for a non-functional protein
-    --minlength		<length> Minimum contig length for annotation [default=500]
-    --meta		Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
+    --refgenome/-g  <GBK_ID>             Provide a Genbank file or a Gbk Accession ID.
+    --externaldb    <fasta_file>         Finish or do a complete annotation with this sequence database (protein fasta file).
+                                         Fasta headers need to look similar to NCBI or EBI fasta headers
+                                         EX: >gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
+                                             >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
+  // Options
+    --pidentity     <% identity>          Minimum percentage identity to incorporate a CDS annotation [default=0.8]
+    --pcoverage     <% identity>          Minimum percentage of coverage over protein alignment to incorporate a CDS annotation [default=0.8]
+                                          // otherwise hint for a non-functional protein
+    --minlength     <length>              Minimum contig length for annotation [default=500]
+    --meta                                Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
 OEM
@@ -101,6 +102,11 @@ def parseOptions_annotate
   options[:minlength] = 500
   options[:meta] = 0
+  if ARGV.length == 0
+    usage_annotate
+    abort
+  end
   while x = ARGV.shift
     case x.downcase
@@ -224,12 +230,14 @@ def usage_identify
 identify [OPTIONS] genome_1.fasta genome_2.fasta genome_x.fasta
-  //MERGEM Database
-    --db/-d        <database directory>
+  //Mash Sketch
+    --mash/-m      <mash sketch file>
   //IO
     --proc         <nb of process> Number of process to run the comparison
+    --output       [csv,tsv|json]
 OEM
 end
@@ -238,21 +246,24 @@ def parseOptions_identify
   options = {}
   options[:proc] = 2
-  options[:genomes_list] = []
+  options[:genome_list] = []
+  options[:output] = "tsv"
   while x = ARGV.shift
     case x.downcase
-    when "--db", "-d"
-      options[:database] = ARGV.shift
+    when "--mash", "-m"
+      options[:mash_file] = ARGV.shift
     when "--proc", "-p"
       options[:proc] = ARGV.shift
+    when "--output", "-o"
+      options[:output] = ARGV.shift
     when "--help", "-h"
       usage_identify
       abort
     else
       if File.exists? "#{x}"
-        options[:genomes_list] << x
+        options[:genome_list] << x
       else
         puts "#{x} file doesn't exist"
         usage_identify
@@ -302,14 +313,14 @@ if ARGV.size >= 1
     # Check Options
     if ! options.has_key? :refgenome and
-       ! options.has_key? :external_db
+       ! options.has_key? :external_db and
+       ! options.has_key? :mergem
       puts "You didn't provide a reference genome or a database for the annotation !"
     elsif ! options.has_key? :input
       puts "You didn't provide a fasta file to annotate !"
     end
     bannot = BacterialAnnotator.new(options, ROOT)
-    bannot.prepare_files_for_annotation
     bannot.run_annotation
   elsif ARGV[0] == "compare"
@@ -317,20 +328,19 @@ if ARGV.size >= 1
     ARGV.shift
     options = parseOptions_compare
     bcomp = BacterialComparator.new(options, ROOT)
-    aln_opt = options[:align].downcase
-    bcomp.mafft_aln aln_opt
-    bcomp.raxml_tree aln_opt, options[:bootstrap] if options[:phylogeny] == 1
+    bcomp.run_comparison
   elsif ARGV[0] == "identify"
     ARGV.shift
     options = parseOptions_identify
-    if options[:genomes_list].empty?
+    if options[:genome_list].empty?
       puts "You need at least 1 genome fasta to identify !!"
       usage_identify
       abort
     end
     bident = BacterialIdentificator.new(options, ROOT)
+    bident.run_identification
   elsif ARGV[0] == "--version" or ARGV[0] == "-v"

data/lib/bacterial-annotator/sequence-annotation.rb CHANGED Viewed

@@ -5,27 +5,208 @@
 # version: 	0.0.1
 # licence:
+require 'json'
+require 'zlib'
 class SequenceAnnotation
-  attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
+  attr_accessor :gbk, :coding_seq, :rna_seq, :cds_file, :rna_file
   # Initialize then genbank file
-  def initialize gbk_file, outdir
+  def initialize root, outdir, file_ref, type
+    @root = root
+    @outdir = outdir
+    @coding_seq = {}
+    @rna_seq = {}
+    case type
+    when "refGbk"
+      # reference genome use for annotation
+      reference_gbk file_ref
+    when "db"
+      # reference database use for annotation
+      reference_db file_ref
+    when "fasta"
+      # single fasta database for annotation (completion)
+      single_fasta file_ref
+    when "newGbk"
+      # new genbank holder to be annotated
+      new_gbk file_ref
+    end
+  end
+  # Use a MERGEM database to get annotation from it
+  def reference_db dir
+    abort "Aborting: Can't find MERGEM db direcotry" if ! File.exists? dir
+    @cds_file = "#{dir}/cds.dmnd"
+    @rna_file = "#{dir}/rnas.fasta"
+    json_genes = {}
+    Zlib::GzipReader.open("#{dir}/cds.json.gz") {|gz|
+      json_genes = JSON.parse(gz.read)
+    }
+    json_genes.each do |gene|
-    @gbk_file = gbk_file
-    if ! File.exists? @gbk_file
-      fetch_ncbi_genome(@gbk_file, outdir)
-      @gbk_file = "#{outdir}/#{gbk_file}.gbk"
-      # @gbk_file += ".gbk"
+      prot_id = gene["cluster_id"]
+      @coding_seq[prot_id] = {
+        protId: prot_id,
+        location: nil,
+        product: gene["consensus_name"],
+        length: gene["consensus_length"]
+      }
+    end
+    # File.open("#{dir}/cds.txt") do |f|
+    #   while l = f.gets
+    #     lA = l.chomp.split(" ")
+    #     @coding_seq[lA[0].gsub(">","")] = {
+    #       protId: lA[0].gsub(">",""),
+    #       location: nil,
+    #       product: lA[1..-1].join(' '),
+    #     }
+    #   end
+    # end
+    File.open("#{dir}/rnas.txt") do |f|
+      while l = f.gets
+        lA = l.chomp.split(" ")
+        @rna_seq[lA[0].gsub(">","")] = {
+          protId: lA[0].gsub(">",""),
+          location: nil,
+          product: lA[1..-1].join(' '),
+        }
+      end
     end
-    flat_gbk = Bio::FlatFile.auto(@gbk_file)
+  end
+  # Use a Genbank Reference and read annotation from it
+  def reference_gbk gbk_file
+    puts "# Preparing reference genome files.."
+    if ! File.exists? gbk_file
+      fetch_ncbi_genome(gbk_file)
+      gbk_file = "#{@outdir}/#{gbk_file}.gbk"
+      # gbk_file += ".gbk"
+    end
+    flat_gbk = Bio::FlatFile.auto(gbk_file)
     # Check if gbk is valid
     if flat_gbk.dbclass != Bio::GenBank
-      abort "Aborting : The input #{@gbk_file} is not a valid genbank file !"
+      abort "Aborting : The input #{gbk_file} is not a valid genbank file !"
+    else
+      @gbk = flat_gbk.next_entry
+    end
+    @bioseq = @gbk.to_biosequence
+    write_cds_to_file
+    write_rna_to_file
+  end
+  # Use a Genbank Reference and read annotation from it
+  def single_fasta fasta_file
+    return "" if ! File.exists? fasta_file
+    File.open(fasta_file, "r") do |dbfile|
+      while l=dbfile.gets
+        if l[0] == ">"
+          lA = l.chomp.split("|")
+          if lA.length > 1      # refseq, ncbi, trembl, swissprot
+            key_gi = l.split(" ")[0][1..-1]
+            product_long = lA[-1]
+            organism = ""
+            product = ""
+            db_source = "[DBSource]"
+            if product_long.scan(/|/).count >= 5 # FROM BIORUBY SCRIPTS
+              product = product_long
+              db_source = "RefSeq"
+            elsif product_long.include? " [" and product_long.include? "]" # NCBI
+              organism = product_long[/\[.*?\]/]
+              product = product_long.split(" [")[0].strip
+            elsif product_long.include? "OS=" # Swissprot / TrEMBL
+              product_tmp = product.split("OS=")
+              organism = product_tmp[1].split(/[A-Z][A-Z]=/)[0].strip
+              product = product_tmp[0].strip
+            elsif product_long.include? "[A-Z][A-Z]=" # NCBI
+              product = product_long.split(/[A-Z][A-Z]=/)[0].strip
+            else
+              product = product_long
+            end
+            org = organism.gsub("[","").gsub("]","")
+            product.lstrip!
+            prot_id = nil
+            if key_gi.count("|") == 4
+              if lA[2] == "ref"
+                db_source = "RefSeq"
+              end
+              prot_id = lA[3]
+            elsif key_gi.count("|") == 2
+              if lA[0].include? == "sp" or
+                lA[0].include? == "tr"
+                db_source = "UniProtKB"
+              end
+              prot_id = lA[1]
+            elsif key_gi.count("|") == 5
+              db_source = "RefSeq"
+              prot_id = lA[2]
+            end
+          else                  # mergem
+          end
+          @coding_seq[key_gi] = { product: product,
+                                  org: org,
+                                  prot_id: prot_id,
+                                  db_source: db_source }
+        end
+      end
+    end
+  end
+  # New Genbank Holder to add annotation to it
+  def new_gbk gbk_file
+    if ! File.exists? gbk_file
+      fetch_ncbi_genome(gbk_file)
+      gbk_file = "#{@outdir}/#{gbk_file}.gbk"
+      # gbk_file += ".gbk"
+    end
+    flat_gbk = Bio::FlatFile.auto(gbk_file)
+    # Check if gbk is valid
+    if flat_gbk.dbclass != Bio::GenBank
+      abort "Aborting : The input #{gbk_file} is not a valid genbank file !"
     else
       @gbk = flat_gbk.next_entry
     end
@@ -38,9 +219,7 @@ class SequenceAnnotation
   # Prepare CDS/proteins
   def get_cds
-    if @coding_seq == nil
-      @coding_seq = {}
+    if @coding_seq.empty?
       # Iterate over each CDS
       @gbk.each_cds do |ft|
@@ -74,7 +253,7 @@ class SequenceAnnotation
           product: product[0],
           bioseq: pepBioSeq,
           bioseq_gene: dnaBioSeq,
-          bioseq_len: pepBioSeq.length
+          length: pepBioSeq.length
         }
       end
@@ -88,12 +267,12 @@ class SequenceAnnotation
   # Prepare rRNA tRNA
   def get_rna
-    if @rna_seq == nil
+    if @rna_seq.empty?
       @rna_seq = {}
       @gbk.features do |ft|
-        next if ! ft.feature.to_s.include? "RNA"
+        next if ! ft.feature.to_s.include? "rRNA"
         ftH = ft.to_hash
         loc = ft.locations
@@ -129,20 +308,19 @@ class SequenceAnnotation
   end
   # Print CDS to files
   # RETURN : cds_file path
-  def write_cds_to_file outdir
+  def write_cds_to_file
     cds_file = "#{@gbk.accession}.pep"
     dna_file = "#{@gbk.accession}.dna"
-    if @coding_seq == nil
+    if @coding_seq.empty?
       get_cds
     end
-    dna_out = File.open("#{outdir}/#{dna_file}", "w")
-    File.open("#{outdir}/#{cds_file}", "w") do |fwrite|
+    dna_out = File.open("#{@outdir}/#{dna_file}", "w")
+    File.open("#{@outdir}/#{cds_file}", "w") do |fwrite|
       @coding_seq.each_key do |k|
         seqout = @coding_seq[k][:bioseq].output_fasta("#{k}",60)
         seqout_dna = @coding_seq[k][:bioseq_gene].output_fasta("#{k}",60)
@@ -152,28 +330,28 @@ class SequenceAnnotation
     end
     dna_out.close
-    @cds_file = "#{outdir}/" + cds_file
+    @cds_file = "#{@outdir}/" + cds_file
   end
   # Print RNA to files
   # RETURN : rna_file path
-  def write_rna_to_file outdir
+  def write_rna_to_file
     rna_file = "#{@gbk.accession}.rna"
-    if @rna_seq == nil
+    if @rna_seq.empty?
       get_rna
     end
-    File.open("#{outdir}/#{rna_file}", "w") do |fwrite|
+    File.open("#{@outdir}/#{rna_file}", "w") do |fwrite|
       @rna_seq.each_key do |k|
         seqout_dna = @rna_seq[k][:bioseq_gene].output_fasta("#{k}|#{@rna_seq[k][:type]}|#{@rna_seq[k][:product]}",60)
         fwrite.write(seqout_dna)
       end
     end
-    @rna_file = "#{outdir}/" + rna_file
+    @rna_file = "#{@outdir}/" + rna_file
   end
@@ -247,6 +425,7 @@ class SequenceAnnotation
           # check if there is a reference genome.. reference_locus shouldn't be nil in that case
           if locus != nil
             qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (AA identity: #{pId}%; coverage(q,s): #{cov_query}%,#{cov_subject}%) from #{ref_genome}")
             ftArray.push(qNote)
@@ -390,9 +569,9 @@ class SequenceAnnotation
   end
-  def save_genbank_to_file outdir
+  def save_genbank_to_file
-    File.open("#{outdir}/#{@gbk.definition}.gbk", "w") do |f|
+    File.open("#{@outdir}/#{@gbk.definition}.gbk", "w") do |f|
       f.write(@gbk.to_biosequence.output(:genbank))
     end
@@ -403,7 +582,7 @@ class SequenceAnnotation
   ###################
   # Fct: Get dna sequence
-  def get_DNA (cds, seq)
+  def get_DNA cds, seq
     loc = cds.locations
     sbeg = loc[0].from.to_i
     send = loc[0].to.to_i
@@ -418,11 +597,11 @@ class SequenceAnnotation
   # Fetch genbank genome from NCBI
-  def fetch_ncbi_genome refgenome_id, outdir
+  def fetch_ncbi_genome refgenome_id
     Bio::NCBI.default_email = 'default@default.com'
     ncbi = Bio::NCBI::REST.new
     genbankstring = ncbi.efetch(refgenome_id, {"db"=>'nucleotide', "rettype"=>'gb'})
-    File.open("#{outdir}/#{refgenome_id}.gbk", "w") do |f|
+    File.open("#{@outdir}/#{refgenome_id}.gbk", "w") do |f|
       f.write(genbankstring)
     end
   end

data/lib/bacterial-annotator/sequence-fasta.rb CHANGED Viewed

@@ -13,8 +13,10 @@ class SequenceFasta
   attr_reader :fasta_flat, :fasta_file, :annotation_files
   # Initialize fasta holder
-  def initialize fasta_file, meta
+  def initialize root, outdir, fasta_file, meta
+    @root = root
+    @outdir = outdir
     @fasta_file = fasta_file
     @fasta_flat = Bio::FlatFile.auto(@fasta_file)
@@ -32,29 +34,29 @@ class SequenceFasta
   # Run prodigal on the genome to annotate
-  def run_prodigal root, outdir
+  def run_prodigal
     @annotation_files = {}
-    Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
+    Dir.mkdir "#{@outdir}" if ! Dir.exists? "#{@outdir}"
     if @meta==1
-      system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
+      system("#{@root}/prodigal.linux -p meta -i #{@fasta_file} -a #{@outdir}/Proteins.fa -d #{@outdir}/Genes.fa -o #{@outdir}/Genbanks.gbk -q")
     else
-      system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
+      system("#{@root}/prodigal.linux -i #{@fasta_file} -a #{@outdir}/Proteins.fa -d #{@outdir}/Genes.fa -o #{@outdir}/Genbanks.gbk -q")
     end
     @annotation_files = {
-      multiGBK: "#{outdir}/Genbanks.gbk",
+      multiGBK: "#{@outdir}/Genbanks.gbk",
       contigs: [],
       contigs_length: [],
-      genes: "#{outdir}/Genes.fa",
-      proteins: "#{outdir}/Proteins.fa",
+      genes: "#{@outdir}/Genes.fa",
+      proteins: "#{@outdir}/Proteins.fa",
       prot_ids_by_contig: {},
-      fasta_path: "#{outdir}/single-fasta/",
-      gbk_path: "#{outdir}/single-genbank/"
+      fasta_path: "#{@outdir}/single-fasta/",
+      gbk_path: "#{@outdir}/single-genbank/"
     }
-    split_fasta outdir
-    split_genbank outdir, "#{outdir}/Genbanks.gbk"
+    split_fasta
+    split_genbank
     extract_cds_names
     @annotation_files
@@ -63,14 +65,14 @@ class SequenceFasta
   # Split Multi Fasta file
   # RETURN : array of fasta files
-  def split_fasta outdir
+  def split_fasta
     @single_fasta = {}
-    Dir.mkdir("#{outdir}/single-fasta") if ! Dir.exists?("#{outdir}/single-fasta")
+    Dir.mkdir("#{@outdir}/single-fasta") if ! Dir.exists?("#{@outdir}/single-fasta")
     @fasta_flat.each_entry do |seq|
       file_name = seq.definition.chomp.split(" ")[0]
       @annotation_files[:contigs] << "#{file_name}"
       @annotation_files[:contigs_length] << seq.seq.length
-      File.open("#{outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
+      File.open("#{@outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
         fwrite.write(seq)
       end
       @single_fasta[file_name] = seq
@@ -80,9 +82,10 @@ class SequenceFasta
   # Split Multi Genbanks file
   # RETURN : array of genbank files
-  def split_genbank outdir, multigbk
+  def split_genbank
-    Dir.mkdir("#{outdir}/single-genbank")if ! Dir.exists?("#{outdir}/single-genbank")
+    multigbk = "#{@outdir}/Genbanks.gbk"
+    Dir.mkdir("#{@outdir}/single-genbank")if ! Dir.exists?("#{@outdir}/single-genbank")
     File.open(multigbk,"r") do |f|
       fopen = nil
       while l = f.gets
@@ -96,7 +99,7 @@ class SequenceFasta
           year = date.year
           locus = "LOCUS       #{file_name}#{spacer}#{seq_length.to_s} bp    DNA     linear   BCT #{day}-#{month}-#{year}\n"
           locus += "DEFINITION  #{file_name}\n"
-          fopen = File.open("#{outdir}/single-genbank/#{file_name}.gbk", "w")
+          fopen = File.open("#{@outdir}/single-genbank/#{file_name}.gbk", "w")
           fopen.write(locus)
         elsif l[0..1] == "//"
           fopen.write(outseq)