mapp2g 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 80ed37b4687cdd64f5a292593e2b15addadbabe49223c79835b530e94ac23be6
4
- data.tar.gz: 37a038d19321a88e10f44261a3027bda85c40c29d529b710bb93ed760ada585e
3
+ metadata.gz: caad7f8e3dc7a4894e8cf5d2de8bb8a706cf654896decfe09e58566bec2d4b74
4
+ data.tar.gz: 67dffdb34daf77d28ffce2c18d152344ad78b9cba6bd66f74b9c30f870e3e9ea
5
5
  SHA512:
6
- metadata.gz: 79f3e7022e8532c1bb9fc0f333927042a4eead1a40d1ecbcb9408d9391a1f21e162830132c395411aac7065076fa14eed694f6ee5b6824a3edc1c193a99ff9ab
7
- data.tar.gz: b6c93c34a6576ea1e57fdc2eecdb04e767a595b6b6a037885ab3a89ebf46f36272b6017f55441788ca1cb89dd80cbe273d6a7049f067445b48d8464a486402b2
6
+ metadata.gz: 47a0a17673e4aebb4b8d83dbbf7239e783aa926ad698ca967fcd718f825ca8f5883f63b6cdd6df41543aaa7ff301364c7e155d64ce0693569735696f2bbf4fd3
7
+ data.tar.gz: 4f31480439323991e2adffd8bbdd75fa05a68ff50ee683eb7b196de5087f169af63caad68538c1f976b662e6bf09d2ac5c4eea544f9f4aae11fa60498efc9f79
data/README.md CHANGED
@@ -26,6 +26,9 @@ Usage: mapp2g [options]
26
26
  -h, --help show this help message and exit
27
27
  ```
28
28
 
29
+ Query sequences should be in FASTA format. Multiple sequences can be included in one file.
30
+
31
+
29
32
  (example)
30
33
  ```
31
34
  mapp2g -q human_genome.fasta -q p53.protein.fasta
@@ -37,6 +40,18 @@ Reference genomes should be formatted in blastdb before running mapp2g. blastdb c
37
40
  makeblastdb -in human_genome.fasta -dbtype nucl -parse_seqids
38
41
  ```
39
42
 
43
+ ## Outputs
44
+
45
+ For each query, the following files are generated.
46
+
47
+ - query sequence in fasta
48
+ - blast output in tab-delimited format (format 6)
49
+ - exonerate full output
50
+ - exonerate alignment in gff3 format
51
+ - report.json
52
+
53
+ report.json contains all of the information above in JSON Lines format (one JSON object per line).
54
+
40
55
 
41
56
  ## License
42
57
 
data/exe/mapp2g CHANGED
@@ -1,9 +1,11 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "mapp2g"
3
+ #require_relative '../lib/mapp2g' # for development
4
+ require 'mapp2g'
4
5
  require 'bio'
5
6
  require 'tempfile'
6
7
  require 'optparse'
8
+ require 'json'
7
9
 
8
10
  ### Parse options
9
11
 
@@ -62,6 +64,9 @@ rescue => e
62
64
  end
63
65
 
64
66
  begin
67
+ unless File.exist?(genome)
68
+ raise "genome file (#{genome}) not found"
69
+ end
65
70
  unless File.exist?("#{genome}.nsq") && File.exist?("#{genome}.nos")
66
71
  raise "genome is not indexed. Please run 'makeblastdb -in #{genome} -dbtype nucl -parse_seqids'"
67
72
  end
@@ -72,6 +77,8 @@ end
72
77
 
73
78
  ### Main
74
79
 
80
+ report_json_lines = []
81
+
75
82
  Dir.mkdir(outdir)
76
83
 
77
84
  Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
@@ -80,11 +87,42 @@ Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
80
87
  tf.close
81
88
  id = (i + 1).to_s
82
89
  query_file_path = "#{outdir}/#{id}.fasta"
83
- out_file_path = "#{outdir}/#{id}.exonerate.txt"
84
90
  File.open(query_file_path, "w"){|o| o.puts fas}
85
91
 
86
92
  mapper = Mapp2g::Mapper.new()
87
93
  res = mapper.run(query_file_path, genome)
88
- File.open(out_file_path, "w"){|o| o.puts res}
94
+ if res
95
+ out_file_path = "#{outdir}/#{id}.exonerate.txt"
96
+ File.open(out_file_path, "w"){|o| o.puts res[:exonerate_result]}
97
+ out_file_path = "#{outdir}/#{id}.blast.txt"
98
+ File.open(out_file_path, "w"){|o| o.puts res[:blast_result]}
99
+ gff3 = Mapp2g::ExonerateOutput.new(res[:exonerate_result]).to_gff3()
100
+ out_file_path = "#{outdir}/#{id}.exonerate.gff3"
101
+ File.open(out_file_path, "w"){|o| o.puts gff3}
102
+
103
+ report = {
104
+ "runtime_id" => id,
105
+ "query_id" => fas.entry_id,
106
+ "query_fasta" => fas.to_s,
107
+ "exonerate" => res[:exonerate_result],
108
+ "blast" => res[:blast_result],
109
+ "gff3" => gff3
110
+ }
111
+ report_json_lines << report.to_json
112
+
113
+ else
114
+ report = {
115
+ "runtime_id" => id,
116
+ "query_id" => fas.entry_id,
117
+ "query_fasta" => fas.to_s,
118
+ "exonerate" => nil,
119
+ "blast" => nil,
120
+ "gff3" => nil
121
+ }
122
+ STDERR.puts "No hit for #{fas.entry_id}"
123
+ end
89
124
 
90
125
  end
126
+
127
+ report_json = report_json_lines.join("\n")
128
+ File.open("#{outdir}/report.json", "w"){|o| o.puts report_json}
data/lib/mapp2g/mapper.rb CHANGED
@@ -4,9 +4,9 @@ module Mapp2g
4
4
 
5
5
  class Mapper
6
6
 
7
- EVALUE_DEFAULT = 1.0e-8
7
+ EVALUE_DEFAULT = 1.0e-5
8
8
  NCPU_DEFAULT = 4
9
- MAX_HSP_INTERVAL = 50000
9
+ MAX_HSP_INTERVAL = 400000
10
10
  EXTENSION = 50000
11
11
  TMPDIR_DEFAULT = Dir.tmpdir
12
12
 
@@ -15,11 +15,11 @@ module Mapp2g
15
15
 
16
16
  ## step 1: tblastn
17
17
  def exec_tblastn_to_know_rough_target_region(query, genome, evalue=EVALUE_DEFAULT, ncpu=NCPU_DEFAULT)
18
- cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} "
18
+ cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} -culling_limit 2"
19
19
  # puts cmd
20
20
  res = nil
21
- IO.popen(cmd){|io| res = io.read}
22
- # puts res
21
+ IO.popen(cmd){|io| res = io.read}
22
+ # STDERR.puts res
23
23
  if res == ""
24
24
  ## no hit
25
25
  return nil
@@ -27,7 +27,7 @@ module Mapp2g
27
27
  lines = []
28
28
  prev_chr = nil
29
29
  res.split(/\n/).each do |l|
30
- # p prev_chr
30
+ # p prev_chr
31
31
  a = l.chomp.split(/\t/)
32
32
  unless prev_chr
33
33
  lines << l
@@ -42,7 +42,7 @@ module Mapp2g
42
42
  a = lines.shift.chomp.split(/\t/)
43
43
  left, right = [a[8].to_i, a[9].to_i].sort
44
44
 
45
- STDERR.puts [left, right].inspect
45
+ # STDERR.puts [left, right].inspect
46
46
 
47
47
  lines.each do |l|
48
48
  a = l.chomp.split(/\t/)
@@ -57,14 +57,16 @@ module Mapp2g
57
57
  break
58
58
  end
59
59
  end
60
- # p [left, right]
60
+ # STDERR.puts [left, right].inspect
61
61
 
62
62
  top_chromosome = res.split(/\n/)[0].split(/\t/)[1]
63
63
  h = {
64
64
  :top_chromosome => top_chromosome,
65
65
  :left => left,
66
- :right => right
66
+ :right => right,
67
+ :blast_result => res
67
68
  }
69
+ # STDERR.puts h.inspect
68
70
  return h
69
71
  end
70
72
  end
@@ -117,8 +119,10 @@ module Mapp2g
117
119
  tf.close
118
120
 
119
121
  exonerate_result = exec_exonerate(query, tf.path)
120
- return exonerate_result
121
-
122
+ return {
123
+ :blast_result => hit[:blast_result],
124
+ :exonerate_result => exonerate_result
125
+ }
122
126
  else
123
127
  return nil
124
128
  end
@@ -0,0 +1,75 @@
1
+ module Mapp2g
2
+
3
+ class ExonerateOutput
4
+
5
+ def self.load(file)
6
+ self.new(File.read(file))
7
+ end
8
+
9
+ # @param exonerate_out [String] exonerate output text, not file path
10
+ def initialize(exonerate_out)
11
+ @exonerate_out = exonerate_out
12
+ @query_name = nil
13
+ @target = nil
14
+ @cigar = nil
15
+ @gff2_lines = []
16
+ #vulgar = nil
17
+ parse()
18
+ end
19
+
20
+ attr_reader :query_name, :target, :cigar, :gff2_lines
21
+
22
+ def parse(opt={})
23
+ @exonerate_out.each_line do |l|
24
+ if m = /\s+Query:\s/.match(l)
25
+ @query_name = m.post_match.chomp.split[0]
26
+ elsif m = /\s+Target:\s/.match(l)
27
+ @target = m.post_match.split[0]
28
+ elsif m = /^cigar:\s/.match(l)
29
+ @cigar = m.post_match.chomp
30
+ elsif /^#{@target}/ =~ l &&
31
+ (/\texonerate:est2genome\t/.match(l) || /\texonerate:protein2genome:local\t/.match(l)) &&
32
+ (/\texon\t/.match(l) || /\tgene\t/.match(l))
33
+ @gff2_lines << l.chomp
34
+ end
35
+ end
36
+ end
37
+
38
+ def to_gff3(opt={})
39
+ gff3_lines = []
40
+ @gff2_lines.each do |l|
41
+ a = l.chomp.split(/\t/)
42
+ b = Array.new(9)
43
+ a.each_with_index{|x, i| b[i] = x}
44
+ if b[2] == "gene"
45
+ b[2] = "match"
46
+ orig_attribute = b[8].split(";").map{|x| x.strip.split(/\s+/)}.to_h
47
+ # p orig_attribute
48
+ c = @cigar.split(/\s+/)
49
+ cigar_pairs = c[9..-1].join.scan(/[MDI]\d+/)
50
+ attribute = {'ID' => @query_name,
51
+ 'Target' => [@query_name, c[1].to_i + 1, c[2]].join(" "),
52
+ 'Gap'=> cigar_pairs.join(" "),
53
+ 'identity' => orig_attribute['identity'],
54
+ 'similarity' => orig_attribute['similarity']}
55
+ b[8] = attribute.map{|k, v| "#{k}=#{v}"}.join(";")
56
+ elsif b[2] == "exon"
57
+ b[2] = "match_part"
58
+ orig_attribute = b[8].split(";").map{|x| x.strip.split(/\s+/)}.to_h
59
+ attribute = {'Parent' => @query_name,
60
+ 'identity' => orig_attribute['identity'],
61
+ 'similarity' => orig_attribute['similarity']}
62
+ b[8] = attribute.map{|k, v| "#{k}=#{v}"}.join(";")
63
+
64
+ else
65
+ raise
66
+ end
67
+ gff3_lines << b.join("\t")
68
+ end
69
+ return gff3_lines.join("\n")
70
+ end
71
+
72
+ end
73
+
74
+ end
75
+
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Mapp2g
4
- VERSION = "0.1.4"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/mapp2g.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "mapp2g/mapper"
4
+ require_relative "mapp2g/report"
4
5
  require_relative "mapp2g/version"
5
6
 
6
7
  module Mapp2g
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mapp2g
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuji Shigenobu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-07-09 00:00:00.000000000 Z
11
+ date: 2023-07-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: mapp2g is a bioinformatics software, which map and align protein sequences
14
14
  (amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment
@@ -28,6 +28,7 @@ files:
28
28
  - exe/mapp2g
29
29
  - lib/mapp2g.rb
30
30
  - lib/mapp2g/mapper.rb
31
+ - lib/mapp2g/report.rb
31
32
  - lib/mapp2g/version.rb
32
33
  - mapp2g.gemspec
33
34
  - scripts/add_annotation_from_uniprot_fasta_to_gff.rb
@@ -57,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
57
58
  - !ruby/object:Gem::Version
58
59
  version: '0'
59
60
  requirements: []
60
- rubygems_version: 3.4.10
61
+ rubygems_version: 3.4.15
61
62
  signing_key:
62
63
  specification_version: 4
63
64
  summary: mapp2g is the tool to map protein sequences to genome references in a splicing-aware