mapp2g 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -0
- data/exe/mapp2g +41 -3
- data/lib/mapp2g/mapper.rb +15 -11
- data/lib/mapp2g/report.rb +75 -0
- data/lib/mapp2g/version.rb +1 -1
- data/lib/mapp2g.rb +1 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: caad7f8e3dc7a4894e8cf5d2de8bb8a706cf654896decfe09e58566bec2d4b74
|
4
|
+
data.tar.gz: 67dffdb34daf77d28ffce2c18d152344ad78b9cba6bd66f74b9c30f870e3e9ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 47a0a17673e4aebb4b8d83dbbf7239e783aa926ad698ca967fcd718f825ca8f5883f63b6cdd6df41543aaa7ff301364c7e155d64ce0693569735696f2bbf4fd3
|
7
|
+
data.tar.gz: 4f31480439323991e2adffd8bbdd75fa05a68ff50ee683eb7b196de5087f169af63caad68538c1f976b662e6bf09d2ac5c4eea544f9f4aae11fa60498efc9f79
|
data/README.md
CHANGED
@@ -26,6 +26,9 @@ Usage: mapp2g [options]
|
|
26
26
|
-h, --help show this help message and exit
|
27
27
|
```
|
28
28
|
|
29
|
+
Query sequences should be in FASTA format. Multiple sequences can be included in one file.
|
30
|
+
|
31
|
+
|
29
32
|
(example)
|
30
33
|
```
|
31
34
|
mapp2g -q human_genome.fasta -q p53.protein.fasta
|
@@ -37,6 +40,18 @@ Reference genomes should be formated in blastdb before running mapp2g. blastdb c
|
|
37
40
|
makeblastdb -in human_genome.fasta -dbtype nucl -parse_seqids
|
38
41
|
```
|
39
42
|
|
43
|
+
## Outputs
|
44
|
+
|
45
|
+
For each query, the following files are generated.
|
46
|
+
|
47
|
+
- query sequence in fasta
|
48
|
+
- blast output in tab-delimited format (format 6)
|
49
|
+
- exonerate full output
|
50
|
+
- exonerate alignment in gff3 format
|
51
|
+
- report.json
|
52
|
+
|
53
|
+
report.json contains all of the information above in JSON Lines format (one JSON object per query, one per line).
|
54
|
+
|
40
55
|
|
41
56
|
## License
|
42
57
|
|
data/exe/mapp2g
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
#require_relative '../lib/mapp2g' # for development
|
4
|
+
require 'mapp2g'
|
4
5
|
require 'bio'
|
5
6
|
require 'tempfile'
|
6
7
|
require 'optparse'
|
8
|
+
require 'json'
|
7
9
|
|
8
10
|
### Parse options
|
9
11
|
|
@@ -62,6 +64,9 @@ rescue => e
|
|
62
64
|
end
|
63
65
|
|
64
66
|
begin
|
67
|
+
unless File.exist?(genome)
|
68
|
+
raise "genome file (#{genome}) not found"
|
69
|
+
end
|
65
70
|
unless File.exist?("#{genome}.nsq") && File.exist?("#{genome}.nos")
|
66
71
|
raise "genome is not indexed. Please run 'makeblastdb -in #{genome} -dbtype nucl -parse_seqids'"
|
67
72
|
end
|
@@ -72,6 +77,8 @@ end
|
|
72
77
|
|
73
78
|
### Main
|
74
79
|
|
80
|
+
report_json_lines = []
|
81
|
+
|
75
82
|
Dir.mkdir(outdir)
|
76
83
|
|
77
84
|
Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
|
@@ -80,11 +87,42 @@ Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
|
|
80
87
|
tf.close
|
81
88
|
id = (i + 1).to_s
|
82
89
|
query_file_path = "#{outdir}/#{id}.fasta"
|
83
|
-
out_file_path = "#{outdir}/#{id}.exonerate.txt"
|
84
90
|
File.open(query_file_path, "w"){|o| o.puts fas}
|
85
91
|
|
86
92
|
mapper = Mapp2g::Mapper.new()
|
87
93
|
res = mapper.run(query_file_path, genome)
|
88
|
-
|
94
|
+
if res
|
95
|
+
out_file_path = "#{outdir}/#{id}.exonerate.txt"
|
96
|
+
File.open(out_file_path, "w"){|o| o.puts res[:exonerate_result]}
|
97
|
+
out_file_path = "#{outdir}/#{id}.blast.txt"
|
98
|
+
File.open(out_file_path, "w"){|o| o.puts res[:blast_result]}
|
99
|
+
gff3 = Mapp2g::ExonerateOutput.new(res[:exonerate_result]).to_gff3()
|
100
|
+
out_file_path = "#{outdir}/#{id}.exonerate.gff3"
|
101
|
+
File.open(out_file_path, "w"){|o| o.puts gff3}
|
102
|
+
|
103
|
+
report = {
|
104
|
+
"runtime_id" => id,
|
105
|
+
"query_id" => fas.entry_id,
|
106
|
+
"query_fasta" => fas.to_s,
|
107
|
+
"exonerate" => res[:exonerate_result],
|
108
|
+
"blast" => res[:blast_result],
|
109
|
+
"gff3" => gff3
|
110
|
+
}
|
111
|
+
report_json_lines << report.to_json
|
112
|
+
|
113
|
+
else
|
114
|
+
report = {
|
115
|
+
"runtime_id" => id,
|
116
|
+
"query_id" => fas.entry_id,
|
117
|
+
"query_fasta" => fas.to_s,
|
118
|
+
"exonerate" => nil,
|
119
|
+
"blast" => nil,
|
120
|
+
"gff3" => nil
|
121
|
+
}
|
122
|
+
STDERR.puts "No hit for #{fas.entry_id}"
|
123
|
+
end
|
89
124
|
|
90
125
|
end
|
126
|
+
|
127
|
+
report_json = report_json_lines.join("\n")
|
128
|
+
File.open("#{outdir}/report.json", "w"){|o| o.puts report_json}
|
data/lib/mapp2g/mapper.rb
CHANGED
@@ -4,9 +4,9 @@ module Mapp2g
|
|
4
4
|
|
5
5
|
class Mapper
|
6
6
|
|
7
|
-
EVALUE_DEFAULT = 1.0e-
|
7
|
+
EVALUE_DEFAULT = 1.0e-5
|
8
8
|
NCPU_DEFAULT = 4
|
9
|
-
MAX_HSP_INTERVAL =
|
9
|
+
MAX_HSP_INTERVAL = 400000
|
10
10
|
EXTENSION = 50000
|
11
11
|
TMPDIR_DEFAULT = Dir.tmpdir
|
12
12
|
|
@@ -15,11 +15,11 @@ module Mapp2g
|
|
15
15
|
|
16
16
|
## step 1: tblastn
|
17
17
|
def exec_tblastn_to_know_rough_target_region(query, genome, evalue=EVALUE_DEFAULT, ncpu=NCPU_DEFAULT)
|
18
|
-
cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} "
|
18
|
+
cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} -culling_limit 2"
|
19
19
|
# puts cmd
|
20
20
|
res = nil
|
21
|
-
IO.popen(cmd){|io| res = io.read}
|
22
|
-
|
21
|
+
IO.popen(cmd){|io| res = io.read}
|
22
|
+
# STDERR.puts res
|
23
23
|
if res == ""
|
24
24
|
## no hit
|
25
25
|
return nil
|
@@ -27,7 +27,7 @@ module Mapp2g
|
|
27
27
|
lines = []
|
28
28
|
prev_chr = nil
|
29
29
|
res.split(/\n/).each do |l|
|
30
|
-
|
30
|
+
# p prev_chr
|
31
31
|
a = l.chomp.split(/\t/)
|
32
32
|
unless prev_chr
|
33
33
|
lines << l
|
@@ -42,7 +42,7 @@ module Mapp2g
|
|
42
42
|
a = lines.shift.chomp.split(/\t/)
|
43
43
|
left, right = [a[8].to_i, a[9].to_i].sort
|
44
44
|
|
45
|
-
STDERR.puts [left, right].inspect
|
45
|
+
# STDERR.puts [left, right].inspect
|
46
46
|
|
47
47
|
lines.each do |l|
|
48
48
|
a = l.chomp.split(/\t/)
|
@@ -57,14 +57,16 @@ module Mapp2g
|
|
57
57
|
break
|
58
58
|
end
|
59
59
|
end
|
60
|
-
|
60
|
+
# STDERR.puts [left, right].inspect
|
61
61
|
|
62
62
|
top_chromosome = res.split(/\n/)[0].split(/\t/)[1]
|
63
63
|
h = {
|
64
64
|
:top_chromosome => top_chromosome,
|
65
65
|
:left => left,
|
66
|
-
:right => right
|
66
|
+
:right => right,
|
67
|
+
:blast_result => res
|
67
68
|
}
|
69
|
+
# STDERR.puts h.inspect
|
68
70
|
return h
|
69
71
|
end
|
70
72
|
end
|
@@ -117,8 +119,10 @@ module Mapp2g
|
|
117
119
|
tf.close
|
118
120
|
|
119
121
|
exonerate_result = exec_exonerate(query, tf.path)
|
120
|
-
return
|
121
|
-
|
122
|
+
return {
|
123
|
+
:blast_result => hit[:blast_result],
|
124
|
+
:exonerate_result => exonerate_result
|
125
|
+
}
|
122
126
|
else
|
123
127
|
return nil
|
124
128
|
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Mapp2g

  # Holds the text output of one exonerate run and converts the GFF2 records
  # found in it into GFF3 "match" / "match_part" records.
  #
  # Only the last "Query:", "Target:" and "cigar:" lines seen in the text are
  # kept; every accepted GFF2 line is collected in order of appearance.
  class ExonerateOutput

    # GFF "source" values accepted by #parse (tab-delimited, anywhere in the line).
    SOURCE_PATTERN = /\texonerate:(?:est2genome|protein2genome:local)\t/

    # GFF "feature" values accepted by #parse (tab-delimited, anywhere in the line).
    FEATURE_PATTERN = /\t(?:exon|gene)\t/

    # Builds an instance from a file that contains exonerate output.
    #
    # @param file [String] path of the exonerate output file
    # @return [ExonerateOutput]
    def self.load(file)
      new(File.read(file))
    end

    # @param exonerate_out [String] exonerate output text, not file path
    def initialize(exonerate_out)
      @exonerate_out = exonerate_out
      @query_name = nil
      @target = nil
      @cigar = nil
      @gff2_lines = []
      parse
    end

    attr_reader :query_name, :target, :cigar, :gff2_lines

    # Scans the exonerate text line by line and fills query_name, target,
    # cigar and gff2_lines. Called from #initialize; calling it again starts
    # from an empty gff2_lines instead of appending duplicates.
    #
    # @param opt [Hash] accepted for interface compatibility; not used
    def parse(opt={})
      @gff2_lines = []
      @exonerate_out.each_line do |l|
        if (m = /\s+Query:\s/.match(l))
          @query_name = m.post_match.chomp.split[0]
        elsif (m = /\s+Target:\s/.match(l))
          @target = m.post_match.split[0]
        elsif (m = /^cigar:\s/.match(l))
          @cigar = m.post_match.chomp
        elsif gff2_record?(l)
          @gff2_lines << l.chomp
        end
      end
    end

    # Converts the collected GFF2 lines to GFF3 text.
    #
    # "gene" records become "match" records carrying ID, Target, Gap, identity
    # and similarity; "exon" records become "match_part" records carrying
    # Parent, identity and similarity. Target and Gap are derived from the
    # cigar line and are left out when no cigar line was found.
    #
    # @param opt [Hash] accepted for interface compatibility; not used
    # @return [String] GFF3 records joined with "\n" (no trailing newline)
    # @raise [RuntimeError] if a collected line is neither "gene" nor "exon"
    def to_gff3(opt={})
      gff3_lines = @gff2_lines.map do |l|
        b = Array.new(9)
        l.chomp.split(/\t/).each_with_index { |x, i| b[i] = x }
        orig_attribute = parse_gff2_attributes(b[8])

        case b[2]
        when "gene"
          b[2] = "match"
          attribute = { 'ID' => @query_name }
          attribute.update(cigar_attributes)
        when "exon"
          b[2] = "match_part"
          attribute = { 'Parent' => @query_name }
        else
          raise "unexpected GFF2 feature type: #{b[2].inspect}"
        end

        attribute['identity'] = orig_attribute['identity']
        attribute['similarity'] = orig_attribute['similarity']
        b[8] = attribute.map { |k, v| "#{k}=#{v}" }.join(";")
        b.join("\t")
      end
      gff3_lines.join("\n")
    end

    private

    # True when the line starts with the current target id and carries an
    # accepted source and feature type. The target id is compared literally
    # (prefix match), so regexp metacharacters in sequence ids are harmless.
    # While no Target line has been seen yet, any prefix is accepted.
    def gff2_record?(line)
      line.start_with?(@target.to_s) &&
        SOURCE_PATTERN.match(line) &&
        FEATURE_PATTERN.match(line) ? true : false
    end

    # Splits a GFF2 attribute column ("key value ; key value ; ...") into a
    # Hash. Entries without both a key and a value are skipped.
    def parse_gff2_attributes(text)
      text.to_s.split(";").each_with_object({}) do |pair, h|
        key, value = pair.strip.split(/\s+/, 2)
        h[key] = value if key && value
      end
    end

    # Target and Gap attributes built from the cigar line; empty when the
    # exonerate text contained no cigar line.
    def cigar_attributes
      return {} unless @cigar

      c = @cigar.split(/\s+/)
      # Fields 0-8 are the alignment coordinates and score; the rest are
      # alternating operation / length tokens.
      cigar_pairs = Array(c[9..-1]).join.scan(/[MDI]\d+/)
      {
        # query start is shifted by one, query end is used as is
        'Target' => [@query_name, c[1].to_i + 1, c[2]].join(" "),
        'Gap' => cigar_pairs.join(" ")
      }
    end

  end

end
|
75
|
+
|
data/lib/mapp2g/version.rb
CHANGED
data/lib/mapp2g.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mapp2g
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuji Shigenobu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-07-
|
11
|
+
date: 2023-07-17 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: mapp2g is a bioinformatics software, which map and align protein sequences
|
14
14
|
(amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment
|
@@ -28,6 +28,7 @@ files:
|
|
28
28
|
- exe/mapp2g
|
29
29
|
- lib/mapp2g.rb
|
30
30
|
- lib/mapp2g/mapper.rb
|
31
|
+
- lib/mapp2g/report.rb
|
31
32
|
- lib/mapp2g/version.rb
|
32
33
|
- mapp2g.gemspec
|
33
34
|
- scripts/add_annotation_from_uniprot_fasta_to_gff.rb
|
@@ -57,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
57
58
|
- !ruby/object:Gem::Version
|
58
59
|
version: '0'
|
59
60
|
requirements: []
|
60
|
-
rubygems_version: 3.4.
|
61
|
+
rubygems_version: 3.4.15
|
61
62
|
signing_key:
|
62
63
|
specification_version: 4
|
63
64
|
summary: mapp2g is the tool to map protein sequences to genome references in a splicing-aware
|