RubyGems - mapp2g - Versions diffs - 0.1.4 → 0.2.0 - Mend

mapp2g 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 80ed37b4687cdd64f5a292593e2b15addadbabe49223c79835b530e94ac23be6
-  data.tar.gz: 37a038d19321a88e10f44261a3027bda85c40c29d529b710bb93ed760ada585e
+  metadata.gz: caad7f8e3dc7a4894e8cf5d2de8bb8a706cf654896decfe09e58566bec2d4b74
+  data.tar.gz: 67dffdb34daf77d28ffce2c18d152344ad78b9cba6bd66f74b9c30f870e3e9ea
 SHA512:
-  metadata.gz: 79f3e7022e8532c1bb9fc0f333927042a4eead1a40d1ecbcb9408d9391a1f21e162830132c395411aac7065076fa14eed694f6ee5b6824a3edc1c193a99ff9ab
-  data.tar.gz: b6c93c34a6576ea1e57fdc2eecdb04e767a595b6b6a037885ab3a89ebf46f36272b6017f55441788ca1cb89dd80cbe273d6a7049f067445b48d8464a486402b2
+  metadata.gz: 47a0a17673e4aebb4b8d83dbbf7239e783aa926ad698ca967fcd718f825ca8f5883f63b6cdd6df41543aaa7ff301364c7e155d64ce0693569735696f2bbf4fd3
+  data.tar.gz: 4f31480439323991e2adffd8bbdd75fa05a68ff50ee683eb7b196de5087f169af63caad68538c1f976b662e6bf09d2ac5c4eea544f9f4aae11fa60498efc9f79

data/README.md CHANGED Viewed

@@ -26,6 +26,9 @@ Usage: mapp2g [options]
     -h, --help                       show this help message and exit
 ```
+Query sequences should be in FASTA format. Multiple sequences can be included in one file.
 (example)
 ```
 mapp2g -q human_genome.fasta -q p53.protein.fasta
@@ -37,6 +40,18 @@ Reference genomes should be formatted in blastdb before running mapp2g. blastdb c
 makeblastdb -in human_genome.fasta -dbtype nucl -parse_seqids
 ```
+## Outputs
+For each query, the following files are generated.
+- query sequence in fasta
+- blast output in tab-delimited format (format 6)
+- exonerate full output
+- exonerate alignment in gff3 format
+- report.json
+report.json contains all of the information above in json line format.
 ## License

data/exe/mapp2g CHANGED Viewed

@@ -1,9 +1,11 @@
 #!/usr/bin/env ruby
-require "mapp2g"
+#require_relative '../lib/mapp2g' # for development
+require 'mapp2g'
 require 'bio'
 require 'tempfile'
 require 'optparse'
+require 'json'
 ### Parse options
@@ -62,6 +64,9 @@ rescue => e
 end
 begin
+  unless File.exist?(genome)
+    raise "genome file (#{genome}) not found"
+  end
   unless File.exist?("#{genome}.nsq") && File.exist?("#{genome}.nos")
     raise "genome is not indexed. Please run 'makeblastdb -in #{genome} -dbtype nucl -parse_seqids'"
   end
@@ -72,6 +77,8 @@ end
 ### Main
+report_json_lines = []
 Dir.mkdir(outdir)
 Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
@@ -80,11 +87,42 @@ Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
   tf.close
   id = (i + 1).to_s
   query_file_path = "#{outdir}/#{id}.fasta"
-  out_file_path = "#{outdir}/#{id}.exonerate.txt"
   File.open(query_file_path, "w"){|o| o.puts fas}
   mapper = Mapp2g::Mapper.new()
   res = mapper.run(query_file_path, genome)
-  File.open(out_file_path, "w"){|o| o.puts res}
+  if res
+    out_file_path = "#{outdir}/#{id}.exonerate.txt"
+    File.open(out_file_path, "w"){|o| o.puts res[:exonerate_result]}
+    out_file_path = "#{outdir}/#{id}.blast.txt"
+    File.open(out_file_path, "w"){|o| o.puts res[:blast_result]}
+    gff3 = Mapp2g::ExonerateOutput.new(res[:exonerate_result]).to_gff3()
+    out_file_path = "#{outdir}/#{id}.exonerate.gff3"
+    File.open(out_file_path, "w"){|o| o.puts gff3}
+    report = {
+      "runtime_id" => id,
+      "query_id" => fas.entry_id,
+      "query_fasta" => fas.to_s,
+      "exonerate" => res[:exonerate_result],
+      "blast" => res[:blast_result],
+      "gff3" => gff3
+    }
+    report_json_lines << report.to_json
+  else
+    report = {
+      "runtime_id" => id,
+      "query_id" => fas.entry_id,
+      "query_fasta" => fas.to_s,
+      "exonerate" => nil,
+      "blast" => nil,
+      "gff3" => nil
+    }
+    STDERR.puts "No hit for #{fas.entry_id}"
+  end
 end
+report_json = report_json_lines.join("\n")
+File.open("#{outdir}/report.json", "w"){|o| o.puts report_json}

data/lib/mapp2g/mapper.rb CHANGED Viewed

@@ -4,9 +4,9 @@ module Mapp2g
   class Mapper
-    EVALUE_DEFAULT = 1.0e-8
+    EVALUE_DEFAULT = 1.0e-5
     NCPU_DEFAULT = 4
-    MAX_HSP_INTERVAL = 50000
+    MAX_HSP_INTERVAL = 400000
     EXTENSION = 50000
     TMPDIR_DEFAULT = Dir.tmpdir
@@ -15,11 +15,11 @@ module Mapp2g
     ## step 1: tblastn
     def exec_tblastn_to_know_rough_target_region(query, genome, evalue=EVALUE_DEFAULT, ncpu=NCPU_DEFAULT)
-      cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} "
+      cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} -culling_limit 2"
       # puts cmd
       res = nil
-      IO.popen(cmd){|io| res = io.read}
-      # puts res
+      IO.popen(cmd){|io| res = io.read}
+#     STDERR.puts res
       if res == ""
       ## no hit
         return nil
@@ -27,7 +27,7 @@ module Mapp2g
         lines = []
         prev_chr = nil
         res.split(/\n/).each do |l|
-      #    p prev_chr
+#          p prev_chr
           a = l.chomp.split(/\t/)
           unless prev_chr
             lines << l
@@ -42,7 +42,7 @@ module Mapp2g
         a = lines.shift.chomp.split(/\t/)
         left, right = [a[8].to_i, a[9].to_i].sort
-        STDERR.puts [left, right].inspect
+#        STDERR.puts [left, right].inspect
         lines.each do |l|
           a = l.chomp.split(/\t/)
@@ -57,14 +57,16 @@ module Mapp2g
             break
           end
         end
-      #  p [left, right]
+#        STDERR.puts [left, right].inspect
         top_chromosome = res.split(/\n/)[0].split(/\t/)[1]
         h = {
           :top_chromosome => top_chromosome,
           :left => left,
-          :right => right
+          :right => right,
+          :blast_result => res
         }
+#        STDERR.puts h.inspect
         return h
       end
     end
@@ -117,8 +119,10 @@ module Mapp2g
         tf.close
         exonerate_result = exec_exonerate(query, tf.path)
-        return exonerate_result
+        return {
+          :blast_result => hit[:blast_result],
+          :exonerate_result => exonerate_result
+        }
       else
         return nil
       end

data/lib/mapp2g/report.rb ADDED Viewed

@@ -0,0 +1,75 @@
module Mapp2g
  # Parses the text output of exonerate and converts the GFF2 gene/exon
  # lines it contains into GFF3 match/match_part lines.
  #
  # Only the most recent "Query:", "Target:" and "cigar:" lines are kept, so
  # every converted line is annotated with the last values seen.
  # NOTE(review): output holding several alignments would presumably need a
  # per-alignment query name and cigar -- confirm against how exonerate is run.
  class ExonerateOutput
    # Matches the GFF "source" values (tab-delimited) accepted by the parser.
    GFF_SOURCE_PATTERN = /\texonerate:(?:est2genome|protein2genome:local)\t/
    # Matches the GFF "feature" values (tab-delimited) accepted by the parser.
    GFF_FEATURE_PATTERN = /\t(?:exon|gene)\t/

    attr_reader :query_name, :target, :cigar, :gff2_lines

    # Builds an instance from a file holding exonerate output.
    #
    # @param file [String] path to the exonerate output file
    # @return [ExonerateOutput]
    def self.load(file)
      new(File.read(file))
    end

    # @param exonerate_out [String] exonerate output text, not file path
    def initialize(exonerate_out)
      @exonerate_out = exonerate_out
      @query_name = nil
      @target = nil
      @cigar = nil
      @gff2_lines = []
      parse
    end

    # Scans the output line by line and fills in query_name, target, cigar
    # and gff2_lines. Calling it again re-reads the same text without
    # accumulating duplicate GFF2 lines.
    #
    # @param opt [Hash] currently unused; kept for interface compatibility
    def parse(opt = {})
      @gff2_lines = []
      @exonerate_out.each_line do |l|
        if (m = /\s+Query:\s/.match(l))
          @query_name = m.post_match.chomp.split[0]
        elsif (m = /\s+Target:\s/.match(l))
          @target = m.post_match.split[0]
        elsif (m = /^cigar:\s/.match(l))
          @cigar = m.post_match.chomp
        elsif gff2_feature_line?(l)
          @gff2_lines << l.chomp
        end
      end
    end

    # Converts the collected GFF2 lines: "gene" becomes "match" and "exon"
    # becomes "match_part", with the attribute column rewritten as key=value
    # pairs.
    #
    # @param opt [Hash] currently unused; kept for interface compatibility
    # @return [String] converted lines joined with newlines ("" when none)
    # @raise [RuntimeError] if a gene line is present but no cigar line was
    #   found, or if a collected line has an unexpected feature type
    def to_gff3(opt = {})
      converted = @gff2_lines.map do |l|
        cols = l.chomp.split(/\t/)
        # Pad short lines so the attribute column (index 8) always exists.
        cols.concat(Array.new(9 - cols.size)) if cols.size < 9
        orig_attribute = parse_gff2_attributes(cols[8])

        case cols[2]
        when "gene"
          cols[2] = "match"
          cols[8] = format_gff3_attributes(match_attributes(orig_attribute))
        when "exon"
          cols[2] = "match_part"
          cols[8] = format_gff3_attributes(
            'Parent' => @query_name,
            'identity' => orig_attribute['identity'],
            'similarity' => orig_attribute['similarity']
          )
        else
          raise "unexpected GFF feature type: #{cols[2].inspect}"
        end
        cols.join("\t")
      end
      converted.join("\n")
    end

    private

    # True when the line starts with the current target name and is a
    # gene/exon line from one of the accepted exonerate models. The target
    # name is compared literally; it used to be interpolated into a regexp,
    # where characters such as "(", "|" or "." changed what was matched.
    def gff2_feature_line?(line)
      line.start_with?(@target.to_s) &&
        GFF_SOURCE_PATTERN.match?(line) &&
        GFF_FEATURE_PATTERN.match?(line)
    end

    # Splits a GFF2 attribute column ("key value ; key value") into a Hash.
    # Empty segments (e.g. after a trailing ";") are skipped and only the
    # first run of whitespace separates key from value.
    def parse_gff2_attributes(text)
      text.to_s.split(";").each_with_object({}) do |chunk, attrs|
        key, value = chunk.strip.split(/\s+/, 2)
        attrs[key] = value if key
      end
    end

    # Builds the attributes of a "match" line from the cigar line.
    # Fields 1 and 2 of the cigar line are used as the query start/end and
    # fields 9 onward as operation/length tokens.
    # NOTE(review): the +1 on the start presumably converts a 0-based
    # coordinate to 1-based -- confirm against the exonerate manual.
    def match_attributes(orig_attribute)
      unless @cigar
        raise "cigar line not found in exonerate output; cannot build Target/Gap attributes"
      end

      c = @cigar.split(/\s+/)
      cigar_pairs = Array(c[9..-1]).join.scan(/[MDI]\d+/)
      {
        'ID' => @query_name,
        'Target' => [@query_name, c[1].to_i + 1, c[2]].join(" "),
        'Gap' => cigar_pairs.join(" "),
        'identity' => orig_attribute['identity'],
        'similarity' => orig_attribute['similarity']
      }
    end

    # Serializes a Hash as a GFF3 attribute column ("k=v;k=v").
    def format_gff3_attributes(attribute)
      attribute.map { |k, v| "#{k}=#{v}" }.join(";")
    end
  end
end

data/lib/mapp2g/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Mapp2g
-  VERSION = "0.1.4"
+  VERSION = "0.2.0"
 end

data/lib/mapp2g.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require_relative "mapp2g/mapper"
+require_relative "mapp2g/report"
 require_relative "mapp2g/version"
 module Mapp2g

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: mapp2g
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.2.0
 platform: ruby
 authors:
 - Shuji Shigenobu
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-09 00:00:00.000000000 Z
+date: 2023-07-17 00:00:00.000000000 Z
 dependencies: []
 description: mapp2g is a bioinformatics software, which map and align protein sequences
   (amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment
@@ -28,6 +28,7 @@ files:
 - exe/mapp2g
 - lib/mapp2g.rb
 - lib/mapp2g/mapper.rb
+- lib/mapp2g/report.rb
 - lib/mapp2g/version.rb
 - mapp2g.gemspec
 - scripts/add_annotation_from_uniprot_fasta_to_gff.rb
@@ -57,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.4.10
+rubygems_version: 3.4.15
 signing_key:
 specification_version: 4
 summary: mapp2g is the tool to map protein sequences to genome references in a splicing-aware