mapp2g 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 80ed37b4687cdd64f5a292593e2b15addadbabe49223c79835b530e94ac23be6
4
- data.tar.gz: 37a038d19321a88e10f44261a3027bda85c40c29d529b710bb93ed760ada585e
3
+ metadata.gz: caad7f8e3dc7a4894e8cf5d2de8bb8a706cf654896decfe09e58566bec2d4b74
4
+ data.tar.gz: 67dffdb34daf77d28ffce2c18d152344ad78b9cba6bd66f74b9c30f870e3e9ea
5
5
  SHA512:
6
- metadata.gz: 79f3e7022e8532c1bb9fc0f333927042a4eead1a40d1ecbcb9408d9391a1f21e162830132c395411aac7065076fa14eed694f6ee5b6824a3edc1c193a99ff9ab
7
- data.tar.gz: b6c93c34a6576ea1e57fdc2eecdb04e767a595b6b6a037885ab3a89ebf46f36272b6017f55441788ca1cb89dd80cbe273d6a7049f067445b48d8464a486402b2
6
+ metadata.gz: 47a0a17673e4aebb4b8d83dbbf7239e783aa926ad698ca967fcd718f825ca8f5883f63b6cdd6df41543aaa7ff301364c7e155d64ce0693569735696f2bbf4fd3
7
+ data.tar.gz: 4f31480439323991e2adffd8bbdd75fa05a68ff50ee683eb7b196de5087f169af63caad68538c1f976b662e6bf09d2ac5c4eea544f9f4aae11fa60498efc9f79
data/README.md CHANGED
@@ -26,6 +26,9 @@ Usage: mapp2g [options]
26
26
  -h, --help show this help message and exit
27
27
  ```
28
28
 
29
+ Query sequences should be in FASTA format. Multiple sequences can be included in one file.
30
+
31
+
29
32
  (example)
30
33
  ```
31
34
  mapp2g -q human_genome.fasta -q p53.protein.fasta
@@ -37,6 +40,18 @@ Reference genomes should be formatted in blastdb before running mapp2g. blastdb c
37
40
  makeblastdb -in human_genome.fasta -dbtype nucl -parse_seqids
38
41
  ```
39
42
 
43
+ ## Outputs
44
+
45
+ For each query, the following files are generated.
46
+
47
+ - query sequence in FASTA format
48
+ - blast output in tab-delimited format (format 6)
49
+ - exonerate full output
50
+ - exonerate alignment in gff3 format
51
+ - report.json
52
+
53
+ report.json contains all of the information above in JSON Lines format (one JSON object per line).
54
+
40
55
 
41
56
  ## License
42
57
 
data/exe/mapp2g CHANGED
@@ -1,9 +1,11 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "mapp2g"
3
+ #require_relative '../lib/mapp2g' # for development
4
+ require 'mapp2g'
4
5
  require 'bio'
5
6
  require 'tempfile'
6
7
  require 'optparse'
8
+ require 'json'
7
9
 
8
10
  ### Parse options
9
11
 
@@ -62,6 +64,9 @@ rescue => e
62
64
  end
63
65
 
64
66
  begin
67
+ unless File.exist?(genome)
68
+ raise "genome file (#{genome}) not found"
69
+ end
65
70
  unless File.exist?("#{genome}.nsq") && File.exist?("#{genome}.nos")
66
71
  raise "genome is not indexed. Please run 'makeblastdb -in #{genome} -dbtype nucl -parse_seqids'"
67
72
  end
@@ -72,6 +77,8 @@ end
72
77
 
73
78
  ### Main
74
79
 
80
+ report_json_lines = []
81
+
75
82
  Dir.mkdir(outdir)
76
83
 
77
84
  Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
@@ -80,11 +87,42 @@ Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
80
87
  tf.close
81
88
  id = (i + 1).to_s
82
89
  query_file_path = "#{outdir}/#{id}.fasta"
83
- out_file_path = "#{outdir}/#{id}.exonerate.txt"
84
90
  File.open(query_file_path, "w"){|o| o.puts fas}
85
91
 
86
92
  mapper = Mapp2g::Mapper.new()
87
93
  res = mapper.run(query_file_path, genome)
88
- File.open(out_file_path, "w"){|o| o.puts res}
94
+ if res
95
+ out_file_path = "#{outdir}/#{id}.exonerate.txt"
96
+ File.open(out_file_path, "w"){|o| o.puts res[:exonerate_result]}
97
+ out_file_path = "#{outdir}/#{id}.blast.txt"
98
+ File.open(out_file_path, "w"){|o| o.puts res[:blast_result]}
99
+ gff3 = Mapp2g::ExonerateOutput.new(res[:exonerate_result]).to_gff3()
100
+ out_file_path = "#{outdir}/#{id}.exonerate.gff3"
101
+ File.open(out_file_path, "w"){|o| o.puts gff3}
102
+
103
+ report = {
104
+ "runtime_id" => id,
105
+ "query_id" => fas.entry_id,
106
+ "query_fasta" => fas.to_s,
107
+ "exonerate" => res[:exonerate_result],
108
+ "blast" => res[:blast_result],
109
+ "gff3" => gff3
110
+ }
111
+ report_json_lines << report.to_json
112
+
113
+ else
114
+ report = {
115
+ "runtime_id" => id,
116
+ "query_id" => fas.entry_id,
117
+ "query_fasta" => fas.to_s,
118
+ "exonerate" => nil,
119
+ "blast" => nil,
120
+ "gff3" => nil
121
+ }
122
+ STDERR.puts "No hit for #{fas.entry_id}"
123
+ end
89
124
 
90
125
  end
126
+
127
+ report_json = report_json_lines.join("\n")
128
+ File.open("#{outdir}/report.json", "w"){|o| o.puts report_json}
data/lib/mapp2g/mapper.rb CHANGED
@@ -4,9 +4,9 @@ module Mapp2g
4
4
 
5
5
  class Mapper
6
6
 
7
- EVALUE_DEFAULT = 1.0e-8
7
+ EVALUE_DEFAULT = 1.0e-5
8
8
  NCPU_DEFAULT = 4
9
- MAX_HSP_INTERVAL = 50000
9
+ MAX_HSP_INTERVAL = 400000
10
10
  EXTENSION = 50000
11
11
  TMPDIR_DEFAULT = Dir.tmpdir
12
12
 
@@ -15,11 +15,11 @@ module Mapp2g
15
15
 
16
16
  ## step 1: tblastn
17
17
  def exec_tblastn_to_know_rough_target_region(query, genome, evalue=EVALUE_DEFAULT, ncpu=NCPU_DEFAULT)
18
- cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} "
18
+ cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} -culling_limit 2"
19
19
  # puts cmd
20
20
  res = nil
21
- IO.popen(cmd){|io| res = io.read}
22
- # puts res
21
+ IO.popen(cmd){|io| res = io.read}
22
+ # STDERR.puts res
23
23
  if res == ""
24
24
  ## no hit
25
25
  return nil
@@ -27,7 +27,7 @@ module Mapp2g
27
27
  lines = []
28
28
  prev_chr = nil
29
29
  res.split(/\n/).each do |l|
30
- # p prev_chr
30
+ # p prev_chr
31
31
  a = l.chomp.split(/\t/)
32
32
  unless prev_chr
33
33
  lines << l
@@ -42,7 +42,7 @@ module Mapp2g
42
42
  a = lines.shift.chomp.split(/\t/)
43
43
  left, right = [a[8].to_i, a[9].to_i].sort
44
44
 
45
- STDERR.puts [left, right].inspect
45
+ # STDERR.puts [left, right].inspect
46
46
 
47
47
  lines.each do |l|
48
48
  a = l.chomp.split(/\t/)
@@ -57,14 +57,16 @@ module Mapp2g
57
57
  break
58
58
  end
59
59
  end
60
- # p [left, right]
60
+ # STDERR.puts [left, right].inspect
61
61
 
62
62
  top_chromosome = res.split(/\n/)[0].split(/\t/)[1]
63
63
  h = {
64
64
  :top_chromosome => top_chromosome,
65
65
  :left => left,
66
- :right => right
66
+ :right => right,
67
+ :blast_result => res
67
68
  }
69
+ # STDERR.puts h.inspect
68
70
  return h
69
71
  end
70
72
  end
@@ -117,8 +119,10 @@ module Mapp2g
117
119
  tf.close
118
120
 
119
121
  exonerate_result = exec_exonerate(query, tf.path)
120
- return exonerate_result
121
-
122
+ return {
123
+ :blast_result => hit[:blast_result],
124
+ :exonerate_result => exonerate_result
125
+ }
122
126
  else
123
127
  return nil
124
128
  end
@@ -0,0 +1,75 @@
1
+ module Mapp2g
2
+
3
+ class ExonerateOutput
4
+
5
+ def self.load(file)
6
+ self.new(File.read(file))
7
+ end
8
+
9
+ # @param exonerate_out [String] exonerate output text, not file path
10
+ def initialize(exonerate_out)
11
+ @exonerate_out = exonerate_out
12
+ @query_name = nil
13
+ @target = nil
14
+ @cigar = nil
15
+ @gff2_lines = []
16
+ #vulgar = nil
17
+ parse()
18
+ end
19
+
20
+ attr_reader :query_name, :target, :cigar, :gff2_lines
21
+
22
+ def parse(opt={})
23
+ @exonerate_out.each_line do |l|
24
+ if m = /\s+Query:\s/.match(l)
25
+ @query_name = m.post_match.chomp.split[0]
26
+ elsif m = /\s+Target:\s/.match(l)
27
+ @target = m.post_match.split[0]
28
+ elsif m = /^cigar:\s/.match(l)
29
+ @cigar = m.post_match.chomp
30
+ elsif /^#{@target}/ =~ l &&
31
+ (/\texonerate:est2genome\t/.match(l) || /\texonerate:protein2genome:local\t/.match(l)) &&
32
+ (/\texon\t/.match(l) || /\tgene\t/.match(l))
33
+ @gff2_lines << l.chomp
34
+ end
35
+ end
36
+ end
37
+
38
+ def to_gff3(opt={})
39
+ gff3_lines = []
40
+ @gff2_lines.each do |l|
41
+ a = l.chomp.split(/\t/)
42
+ b = Array.new(9)
43
+ a.each_with_index{|x, i| b[i] = x}
44
+ if b[2] == "gene"
45
+ b[2] = "match"
46
+ orig_attribute = b[8].split(";").map{|x| x.strip.split(/\s+/)}.to_h
47
+ # p orig_attribute
48
+ c = @cigar.split(/\s+/)
49
+ cigar_pairs = c[9..-1].join.scan(/[MDI]\d+/)
50
+ attribute = {'ID' => @query_name,
51
+ 'Target' => [@query_name, c[1].to_i + 1, c[2]].join(" "),
52
+ 'Gap'=> cigar_pairs.join(" "),
53
+ 'identity' => orig_attribute['identity'],
54
+ 'similarity' => orig_attribute['similarity']}
55
+ b[8] = attribute.map{|k, v| "#{k}=#{v}"}.join(";")
56
+ elsif b[2] == "exon"
57
+ b[2] = "match_part"
58
+ orig_attribute = b[8].split(";").map{|x| x.strip.split(/\s+/)}.to_h
59
+ attribute = {'Parent' => @query_name,
60
+ 'identity' => orig_attribute['identity'],
61
+ 'similarity' => orig_attribute['similarity']}
62
+ b[8] = attribute.map{|k, v| "#{k}=#{v}"}.join(";")
63
+
64
+ else
65
+ raise
66
+ end
67
+ gff3_lines << b.join("\t")
68
+ end
69
+ return gff3_lines.join("\n")
70
+ end
71
+
72
+ end
73
+
74
+ end
75
+
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Mapp2g
4
- VERSION = "0.1.4"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/mapp2g.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "mapp2g/mapper"
4
+ require_relative "mapp2g/report"
4
5
  require_relative "mapp2g/version"
5
6
 
6
7
  module Mapp2g
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mapp2g
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuji Shigenobu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-07-09 00:00:00.000000000 Z
11
+ date: 2023-07-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: mapp2g is a bioinformatics software, which map and align protein sequences
14
14
  (amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment
@@ -28,6 +28,7 @@ files:
28
28
  - exe/mapp2g
29
29
  - lib/mapp2g.rb
30
30
  - lib/mapp2g/mapper.rb
31
+ - lib/mapp2g/report.rb
31
32
  - lib/mapp2g/version.rb
32
33
  - mapp2g.gemspec
33
34
  - scripts/add_annotation_from_uniprot_fasta_to_gff.rb
@@ -57,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
57
58
  - !ruby/object:Gem::Version
58
59
  version: '0'
59
60
  requirements: []
60
- rubygems_version: 3.4.10
61
+ rubygems_version: 3.4.15
61
62
  signing_key:
62
63
  specification_version: 4
63
64
  summary: mapp2g is the tool to map protein sequences to genome references in a splicing-aware