mapp2g 0.1.5 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +15 -0
- data/exe/mapp2g +38 -6
- data/lib/mapp2g/mapper.rb +15 -11
- data/lib/mapp2g/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: caad7f8e3dc7a4894e8cf5d2de8bb8a706cf654896decfe09e58566bec2d4b74
|
4
|
+
data.tar.gz: 67dffdb34daf77d28ffce2c18d152344ad78b9cba6bd66f74b9c30f870e3e9ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 47a0a17673e4aebb4b8d83dbbf7239e783aa926ad698ca967fcd718f825ca8f5883f63b6cdd6df41543aaa7ff301364c7e155d64ce0693569735696f2bbf4fd3
|
7
|
+
data.tar.gz: 4f31480439323991e2adffd8bbdd75fa05a68ff50ee683eb7b196de5087f169af63caad68538c1f976b662e6bf09d2ac5c4eea544f9f4aae11fa60498efc9f79
|
data/README.md
CHANGED
@@ -26,6 +26,9 @@ Usage: mapp2g [options]
|
|
26
26
|
-h, --help show this help message and exit
|
27
27
|
```
|
28
28
|
|
29
|
+
Query sequences should be in FASTA format. Multiple sequences can be included in one file.
|
30
|
+
|
31
|
+
|
29
32
|
(example)
|
30
33
|
```
|
31
34
|
mapp2g -q human_genome.fasta -q p53.protein.fasta
|
@@ -37,6 +40,18 @@ Reference genomes should be formatted in blastdb before running mapp2g. blastdb c
|
|
37
40
|
makeblastdb -in human_genome.fasta -dbtype nucl -parse_seqids
|
38
41
|
```
|
39
42
|
|
43
|
+
## Outputs
|
44
|
+
|
45
|
+
For each query, the following files are generated.
|
46
|
+
|
47
|
+
- query sequence in FASTA format
|
48
|
+
- blast output in tab-delimited format (-outfmt 6)
|
49
|
+
- exonerate full output
|
50
|
+
- exonerate alignment in gff3 format
|
51
|
+
- report.json
|
52
|
+
|
53
|
+
report.json contains all of the information above in JSON Lines format (one JSON object per line).
|
54
|
+
|
40
55
|
|
41
56
|
## License
|
42
57
|
|
data/exe/mapp2g
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
#require_relative '../lib/mapp2g' # for development
|
4
|
+
require 'mapp2g'
|
4
5
|
require 'bio'
|
5
6
|
require 'tempfile'
|
6
7
|
require 'optparse'
|
8
|
+
require 'json'
|
7
9
|
|
8
10
|
### Parse options
|
9
11
|
|
@@ -75,6 +77,8 @@ end
|
|
75
77
|
|
76
78
|
### Main
|
77
79
|
|
80
|
+
report_json_lines = []
|
81
|
+
|
78
82
|
Dir.mkdir(outdir)
|
79
83
|
|
80
84
|
Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
|
@@ -83,14 +87,42 @@ Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
|
|
83
87
|
tf.close
|
84
88
|
id = (i + 1).to_s
|
85
89
|
query_file_path = "#{outdir}/#{id}.fasta"
|
86
|
-
out_file_path = "#{outdir}/#{id}.exonerate.txt"
|
87
90
|
File.open(query_file_path, "w"){|o| o.puts fas}
|
88
91
|
|
89
92
|
mapper = Mapp2g::Mapper.new()
|
90
93
|
res = mapper.run(query_file_path, genome)
|
91
|
-
|
94
|
+
if res
|
95
|
+
out_file_path = "#{outdir}/#{id}.exonerate.txt"
|
96
|
+
File.open(out_file_path, "w"){|o| o.puts res[:exonerate_result]}
|
97
|
+
out_file_path = "#{outdir}/#{id}.blast.txt"
|
98
|
+
File.open(out_file_path, "w"){|o| o.puts res[:blast_result]}
|
99
|
+
gff3 = Mapp2g::ExonerateOutput.new(res[:exonerate_result]).to_gff3()
|
100
|
+
out_file_path = "#{outdir}/#{id}.exonerate.gff3"
|
101
|
+
File.open(out_file_path, "w"){|o| o.puts gff3}
|
102
|
+
|
103
|
+
report = {
|
104
|
+
"runtime_id" => id,
|
105
|
+
"query_id" => fas.entry_id,
|
106
|
+
"query_fasta" => fas.to_s,
|
107
|
+
"exonerate" => res[:exonerate_result],
|
108
|
+
"blast" => res[:blast_result],
|
109
|
+
"gff3" => gff3
|
110
|
+
}
|
111
|
+
report_json_lines << report.to_json
|
112
|
+
|
113
|
+
else
|
114
|
+
report = {
|
115
|
+
"runtime_id" => id,
|
116
|
+
"query_id" => fas.entry_id,
|
117
|
+
"query_fasta" => fas.to_s,
|
118
|
+
"exonerate" => nil,
|
119
|
+
"blast" => nil,
|
120
|
+
"gff3" => nil
|
121
|
+
}
|
122
|
+
STDERR.puts "No hit for #{fas.entry_id}"
|
123
|
+
end
|
92
124
|
|
93
|
-
gff3 = Mapp2g::ExonerateOutput.new(res).to_gff3()
|
94
|
-
out_file_path = "#{outdir}/#{id}.exonerate.gff3"
|
95
|
-
File.open(out_file_path, "w"){|o| o.puts gff3}
|
96
125
|
end
|
126
|
+
|
127
|
+
report_json = report_json_lines.join("\n")
|
128
|
+
File.open("#{outdir}/report.json", "w"){|o| o.puts report_json}
|
data/lib/mapp2g/mapper.rb
CHANGED
@@ -4,9 +4,9 @@ module Mapp2g
|
|
4
4
|
|
5
5
|
class Mapper
|
6
6
|
|
7
|
-
EVALUE_DEFAULT = 1.0e-
|
7
|
+
EVALUE_DEFAULT = 1.0e-5
|
8
8
|
NCPU_DEFAULT = 4
|
9
|
-
MAX_HSP_INTERVAL =
|
9
|
+
MAX_HSP_INTERVAL = 400000
|
10
10
|
EXTENSION = 50000
|
11
11
|
TMPDIR_DEFAULT = Dir.tmpdir
|
12
12
|
|
@@ -15,11 +15,11 @@ module Mapp2g
|
|
15
15
|
|
16
16
|
## step 1: tblastn
|
17
17
|
def exec_tblastn_to_know_rough_target_region(query, genome, evalue=EVALUE_DEFAULT, ncpu=NCPU_DEFAULT)
|
18
|
-
cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} "
|
18
|
+
cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} -culling_limit 2"
|
19
19
|
# puts cmd
|
20
20
|
res = nil
|
21
|
-
IO.popen(cmd){|io| res = io.read}
|
22
|
-
|
21
|
+
IO.popen(cmd){|io| res = io.read}
|
22
|
+
# STDERR.puts res
|
23
23
|
if res == ""
|
24
24
|
## no hit
|
25
25
|
return nil
|
@@ -27,7 +27,7 @@ module Mapp2g
|
|
27
27
|
lines = []
|
28
28
|
prev_chr = nil
|
29
29
|
res.split(/\n/).each do |l|
|
30
|
-
|
30
|
+
# p prev_chr
|
31
31
|
a = l.chomp.split(/\t/)
|
32
32
|
unless prev_chr
|
33
33
|
lines << l
|
@@ -42,7 +42,7 @@ module Mapp2g
|
|
42
42
|
a = lines.shift.chomp.split(/\t/)
|
43
43
|
left, right = [a[8].to_i, a[9].to_i].sort
|
44
44
|
|
45
|
-
STDERR.puts [left, right].inspect
|
45
|
+
# STDERR.puts [left, right].inspect
|
46
46
|
|
47
47
|
lines.each do |l|
|
48
48
|
a = l.chomp.split(/\t/)
|
@@ -57,14 +57,16 @@ module Mapp2g
|
|
57
57
|
break
|
58
58
|
end
|
59
59
|
end
|
60
|
-
|
60
|
+
# STDERR.puts [left, right].inspect
|
61
61
|
|
62
62
|
top_chromosome = res.split(/\n/)[0].split(/\t/)[1]
|
63
63
|
h = {
|
64
64
|
:top_chromosome => top_chromosome,
|
65
65
|
:left => left,
|
66
|
-
:right => right
|
66
|
+
:right => right,
|
67
|
+
:blast_result => res
|
67
68
|
}
|
69
|
+
# STDERR.puts h.inspect
|
68
70
|
return h
|
69
71
|
end
|
70
72
|
end
|
@@ -117,8 +119,10 @@ module Mapp2g
|
|
117
119
|
tf.close
|
118
120
|
|
119
121
|
exonerate_result = exec_exonerate(query, tf.path)
|
120
|
-
return
|
121
|
-
|
122
|
+
return {
|
123
|
+
:blast_result => hit[:blast_result],
|
124
|
+
:exonerate_result => exonerate_result
|
125
|
+
}
|
122
126
|
else
|
123
127
|
return nil
|
124
128
|
end
|
data/lib/mapp2g/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mapp2g
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuji Shigenobu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-07-
|
11
|
+
date: 2023-07-17 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: mapp2g is bioinformatics software that maps and aligns protein sequences
|
14
14
|
(amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment
|