mapp2g 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +15 -0
- data/exe/mapp2g +41 -3
- data/lib/mapp2g/mapper.rb +15 -11
- data/lib/mapp2g/report.rb +75 -0
- data/lib/mapp2g/version.rb +1 -1
- data/lib/mapp2g.rb +1 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: caad7f8e3dc7a4894e8cf5d2de8bb8a706cf654896decfe09e58566bec2d4b74
|
4
|
+
data.tar.gz: 67dffdb34daf77d28ffce2c18d152344ad78b9cba6bd66f74b9c30f870e3e9ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 47a0a17673e4aebb4b8d83dbbf7239e783aa926ad698ca967fcd718f825ca8f5883f63b6cdd6df41543aaa7ff301364c7e155d64ce0693569735696f2bbf4fd3
|
7
|
+
data.tar.gz: 4f31480439323991e2adffd8bbdd75fa05a68ff50ee683eb7b196de5087f169af63caad68538c1f976b662e6bf09d2ac5c4eea544f9f4aae11fa60498efc9f79
|
data/README.md
CHANGED
@@ -26,6 +26,9 @@ Usage: mapp2g [options]
|
|
26
26
|
-h, --help show this help message and exit
|
27
27
|
```
|
28
28
|
|
29
|
+
Query sequences should be in FASTA format. Multiple sequences can be included in one file.
|
30
|
+
|
31
|
+
|
29
32
|
(example)
|
30
33
|
```
|
31
34
|
mapp2g -q human_genome.fasta -q p53.protein.fasta
|
@@ -37,6 +40,18 @@ Reference genomes should be formatted in blastdb before running mapp2g. blastdb c
|
|
37
40
|
makeblastdb -in human_genome.fasta -dbtype nucl -parse_seqids
|
38
41
|
```
|
39
42
|
|
43
|
+
## Outputs
|
44
|
+
|
45
|
+
For each query, the following files are generated.
|
46
|
+
|
47
|
+
- query sequence in FASTA format
|
48
|
+
- BLAST output in tab-delimited format (BLAST -outfmt 6)
|
49
|
+
- exonerate full output
|
50
|
+
- exonerate alignment in gff3 format
|
51
|
+
- report.json
|
52
|
+
|
53
|
+
report.json contains all of the information above in JSON Lines format (one JSON object per line).
|
54
|
+
|
40
55
|
|
41
56
|
## License
|
42
57
|
|
data/exe/mapp2g
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
#require_relative '../lib/mapp2g' # for development
|
4
|
+
require 'mapp2g'
|
4
5
|
require 'bio'
|
5
6
|
require 'tempfile'
|
6
7
|
require 'optparse'
|
8
|
+
require 'json'
|
7
9
|
|
8
10
|
### Parse options
|
9
11
|
|
@@ -62,6 +64,9 @@ rescue => e
|
|
62
64
|
end
|
63
65
|
|
64
66
|
begin
|
67
|
+
unless File.exist?(genome)
|
68
|
+
raise "genome file (#{genome}) not found"
|
69
|
+
end
|
65
70
|
unless File.exist?("#{genome}.nsq") && File.exist?("#{genome}.nos")
|
66
71
|
raise "genome is not indexed. Please run 'makeblastdb -in #{genome} -dbtype nucl -parse_seqids'"
|
67
72
|
end
|
@@ -72,6 +77,8 @@ end
|
|
72
77
|
|
73
78
|
### Main
|
74
79
|
|
80
|
+
report_json_lines = []
|
81
|
+
|
75
82
|
Dir.mkdir(outdir)
|
76
83
|
|
77
84
|
Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
|
@@ -80,11 +87,42 @@ Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
|
|
80
87
|
tf.close
|
81
88
|
id = (i + 1).to_s
|
82
89
|
query_file_path = "#{outdir}/#{id}.fasta"
|
83
|
-
out_file_path = "#{outdir}/#{id}.exonerate.txt"
|
84
90
|
File.open(query_file_path, "w"){|o| o.puts fas}
|
85
91
|
|
86
92
|
mapper = Mapp2g::Mapper.new()
|
87
93
|
res = mapper.run(query_file_path, genome)
|
88
|
-
|
94
|
+
if res
|
95
|
+
out_file_path = "#{outdir}/#{id}.exonerate.txt"
|
96
|
+
File.open(out_file_path, "w"){|o| o.puts res[:exonerate_result]}
|
97
|
+
out_file_path = "#{outdir}/#{id}.blast.txt"
|
98
|
+
File.open(out_file_path, "w"){|o| o.puts res[:blast_result]}
|
99
|
+
gff3 = Mapp2g::ExonerateOutput.new(res[:exonerate_result]).to_gff3()
|
100
|
+
out_file_path = "#{outdir}/#{id}.exonerate.gff3"
|
101
|
+
File.open(out_file_path, "w"){|o| o.puts gff3}
|
102
|
+
|
103
|
+
report = {
|
104
|
+
"runtime_id" => id,
|
105
|
+
"query_id" => fas.entry_id,
|
106
|
+
"query_fasta" => fas.to_s,
|
107
|
+
"exonerate" => res[:exonerate_result],
|
108
|
+
"blast" => res[:blast_result],
|
109
|
+
"gff3" => gff3
|
110
|
+
}
|
111
|
+
report_json_lines << report.to_json
|
112
|
+
|
113
|
+
else
|
114
|
+
report = {
|
115
|
+
"runtime_id" => id,
|
116
|
+
"query_id" => fas.entry_id,
|
117
|
+
"query_fasta" => fas.to_s,
|
118
|
+
"exonerate" => nil,
|
119
|
+
"blast" => nil,
|
120
|
+
"gff3" => nil
|
121
|
+
}
|
122
|
+
STDERR.puts "No hit for #{fas.entry_id}"
|
123
|
+
end
|
89
124
|
|
90
125
|
end
|
126
|
+
|
127
|
+
report_json = report_json_lines.join("\n")
|
128
|
+
File.open("#{outdir}/report.json", "w"){|o| o.puts report_json}
|
data/lib/mapp2g/mapper.rb
CHANGED
@@ -4,9 +4,9 @@ module Mapp2g
|
|
4
4
|
|
5
5
|
class Mapper
|
6
6
|
|
7
|
-
EVALUE_DEFAULT = 1.0e-
|
7
|
+
EVALUE_DEFAULT = 1.0e-5
|
8
8
|
NCPU_DEFAULT = 4
|
9
|
-
MAX_HSP_INTERVAL =
|
9
|
+
MAX_HSP_INTERVAL = 400000
|
10
10
|
EXTENSION = 50000
|
11
11
|
TMPDIR_DEFAULT = Dir.tmpdir
|
12
12
|
|
@@ -15,11 +15,11 @@ module Mapp2g
|
|
15
15
|
|
16
16
|
## step 1: tblastn
|
17
17
|
def exec_tblastn_to_know_rough_target_region(query, genome, evalue=EVALUE_DEFAULT, ncpu=NCPU_DEFAULT)
|
18
|
-
cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} "
|
18
|
+
cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} -culling_limit 2"
|
19
19
|
# puts cmd
|
20
20
|
res = nil
|
21
|
-
IO.popen(cmd){|io| res = io.read}
|
22
|
-
|
21
|
+
IO.popen(cmd){|io| res = io.read}
|
22
|
+
# STDERR.puts res
|
23
23
|
if res == ""
|
24
24
|
## no hit
|
25
25
|
return nil
|
@@ -27,7 +27,7 @@ module Mapp2g
|
|
27
27
|
lines = []
|
28
28
|
prev_chr = nil
|
29
29
|
res.split(/\n/).each do |l|
|
30
|
-
|
30
|
+
# p prev_chr
|
31
31
|
a = l.chomp.split(/\t/)
|
32
32
|
unless prev_chr
|
33
33
|
lines << l
|
@@ -42,7 +42,7 @@ module Mapp2g
|
|
42
42
|
a = lines.shift.chomp.split(/\t/)
|
43
43
|
left, right = [a[8].to_i, a[9].to_i].sort
|
44
44
|
|
45
|
-
STDERR.puts [left, right].inspect
|
45
|
+
# STDERR.puts [left, right].inspect
|
46
46
|
|
47
47
|
lines.each do |l|
|
48
48
|
a = l.chomp.split(/\t/)
|
@@ -57,14 +57,16 @@ module Mapp2g
|
|
57
57
|
break
|
58
58
|
end
|
59
59
|
end
|
60
|
-
|
60
|
+
# STDERR.puts [left, right].inspect
|
61
61
|
|
62
62
|
top_chromosome = res.split(/\n/)[0].split(/\t/)[1]
|
63
63
|
h = {
|
64
64
|
:top_chromosome => top_chromosome,
|
65
65
|
:left => left,
|
66
|
-
:right => right
|
66
|
+
:right => right,
|
67
|
+
:blast_result => res
|
67
68
|
}
|
69
|
+
# STDERR.puts h.inspect
|
68
70
|
return h
|
69
71
|
end
|
70
72
|
end
|
@@ -117,8 +119,10 @@ module Mapp2g
|
|
117
119
|
tf.close
|
118
120
|
|
119
121
|
exonerate_result = exec_exonerate(query, tf.path)
|
120
|
-
return
|
121
|
-
|
122
|
+
return {
|
123
|
+
:blast_result => hit[:blast_result],
|
124
|
+
:exonerate_result => exonerate_result
|
125
|
+
}
|
122
126
|
else
|
123
127
|
return nil
|
124
128
|
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Mapp2g
|
2
|
+
|
3
|
+
class ExonerateOutput
|
4
|
+
|
5
|
+
def self.load(file)
|
6
|
+
self.new(File.read(file))
|
7
|
+
end
|
8
|
+
|
9
|
+
# @param exonerate_out [String] exonerate output text, not file path
|
10
|
+
def initialize(exonerate_out)
|
11
|
+
@exonerate_out = exonerate_out
|
12
|
+
@query_name = nil
|
13
|
+
@target = nil
|
14
|
+
@cigar = nil
|
15
|
+
@gff2_lines = []
|
16
|
+
#vulgar = nil
|
17
|
+
parse()
|
18
|
+
end
|
19
|
+
|
20
|
+
attr_reader :query_name, :target, :cigar, :gff2_lines
|
21
|
+
|
22
|
+
def parse(opt={})
|
23
|
+
@exonerate_out.each_line do |l|
|
24
|
+
if m = /\s+Query:\s/.match(l)
|
25
|
+
@query_name = m.post_match.chomp.split[0]
|
26
|
+
elsif m = /\s+Target:\s/.match(l)
|
27
|
+
@target = m.post_match.split[0]
|
28
|
+
elsif m = /^cigar:\s/.match(l)
|
29
|
+
@cigar = m.post_match.chomp
|
30
|
+
elsif /^#{@target}/ =~ l &&
|
31
|
+
(/\texonerate:est2genome\t/.match(l) || /\texonerate:protein2genome:local\t/.match(l)) &&
|
32
|
+
(/\texon\t/.match(l) || /\tgene\t/.match(l))
|
33
|
+
@gff2_lines << l.chomp
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def to_gff3(opt={})
|
39
|
+
gff3_lines = []
|
40
|
+
@gff2_lines.each do |l|
|
41
|
+
a = l.chomp.split(/\t/)
|
42
|
+
b = Array.new(9)
|
43
|
+
a.each_with_index{|x, i| b[i] = x}
|
44
|
+
if b[2] == "gene"
|
45
|
+
b[2] = "match"
|
46
|
+
orig_attribute = b[8].split(";").map{|x| x.strip.split(/\s+/)}.to_h
|
47
|
+
# p orig_attribute
|
48
|
+
c = @cigar.split(/\s+/)
|
49
|
+
cigar_pairs = c[9..-1].join.scan(/[MDI]\d+/)
|
50
|
+
attribute = {'ID' => @query_name,
|
51
|
+
'Target' => [@query_name, c[1].to_i + 1, c[2]].join(" "),
|
52
|
+
'Gap'=> cigar_pairs.join(" "),
|
53
|
+
'identity' => orig_attribute['identity'],
|
54
|
+
'similarity' => orig_attribute['similarity']}
|
55
|
+
b[8] = attribute.map{|k, v| "#{k}=#{v}"}.join(";")
|
56
|
+
elsif b[2] == "exon"
|
57
|
+
b[2] = "match_part"
|
58
|
+
orig_attribute = b[8].split(";").map{|x| x.strip.split(/\s+/)}.to_h
|
59
|
+
attribute = {'Parent' => @query_name,
|
60
|
+
'identity' => orig_attribute['identity'],
|
61
|
+
'similarity' => orig_attribute['similarity']}
|
62
|
+
b[8] = attribute.map{|k, v| "#{k}=#{v}"}.join(";")
|
63
|
+
|
64
|
+
else
|
65
|
+
raise
|
66
|
+
end
|
67
|
+
gff3_lines << b.join("\t")
|
68
|
+
end
|
69
|
+
return gff3_lines.join("\n")
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
|
data/lib/mapp2g/version.rb
CHANGED
data/lib/mapp2g.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mapp2g
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuji Shigenobu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-07-
|
11
|
+
date: 2023-07-17 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: mapp2g is a bioinformatics software, which map and align protein sequences
|
14
14
|
(amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment
|
@@ -28,6 +28,7 @@ files:
|
|
28
28
|
- exe/mapp2g
|
29
29
|
- lib/mapp2g.rb
|
30
30
|
- lib/mapp2g/mapper.rb
|
31
|
+
- lib/mapp2g/report.rb
|
31
32
|
- lib/mapp2g/version.rb
|
32
33
|
- mapp2g.gemspec
|
33
34
|
- scripts/add_annotation_from_uniprot_fasta_to_gff.rb
|
@@ -57,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
57
58
|
- !ruby/object:Gem::Version
|
58
59
|
version: '0'
|
59
60
|
requirements: []
|
60
|
-
rubygems_version: 3.4.
|
61
|
+
rubygems_version: 3.4.15
|
61
62
|
signing_key:
|
62
63
|
specification_version: 4
|
63
64
|
summary: mapp2g is the tool to map protein sequences to genome references in a splicing-aware
|