mapp2g 0.1.3 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/exe/mapp2g +35 -1
- data/lib/mapp2g/report.rb +75 -0
- data/lib/mapp2g/version.rb +1 -1
- data/lib/mapp2g.rb +1 -0
- data/scripts/add_annotation_from_uniprot_fasta_to_gff.rb +117 -0
- data/scripts/mapp2g-exonerate_gff2_to_jbgff3.v2.rb +37 -0
- data/scripts/mapp2g-exonerate_gff2_to_jbgff3.v3.rb +70 -0
- metadata +10 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea23c58705cef813135bf96b23383cdeca589643ff08a25ba8b95fd473e26449
|
4
|
+
data.tar.gz: 7be36a76318408344b4402fafda2393ba51d6e2e36fe2c50e6ad393817ff6cbb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cfed44714ac742bd9cc8a17169f85c691292e2fe1d7c4d62960d3babe0cd61cf59b01c0b333af063975aef50f17c99aab9f06103b662da03e00813d456f45e02
|
7
|
+
data.tar.gz: c4f3711b154e3ad3d730dde9e5637061514563f49607a814c5bbf2a674de3b06a81f96926aac9d8c32bc5feedb06c7a3dab8a481a8f122b138d60530b923af7c
|
data/exe/mapp2g
CHANGED
@@ -5,6 +5,7 @@ require 'bio'
|
|
5
5
|
require 'tempfile'
|
6
6
|
require 'optparse'
|
7
7
|
|
8
|
+
### Parse options
|
8
9
|
|
9
10
|
opt = OptionParser.new
|
10
11
|
OPTS = {}
|
@@ -30,7 +31,6 @@ begin
|
|
30
31
|
|
31
32
|
opt.parse!(ARGV)
|
32
33
|
|
33
|
-
|
34
34
|
rescue => e
|
35
35
|
puts "ERROR: #{e}\nSee #{opt}"
|
36
36
|
exit
|
@@ -44,6 +44,37 @@ outdir = (OPTS[:o] || "mapp2g_out_#{$$}")
|
|
44
44
|
|
45
45
|
#p [query, genome, outdir]
|
46
46
|
|
47
|
+
### Check environment
|
48
|
+
|
49
|
+
# Verify that an external program can be found.
#
# The lookup walks ENV["PATH"] directly instead of interpolating +name+ into
# a `which ...` shell command line, so it does not depend on an external
# helper program and shell metacharacters in +name+ are never interpreted.
#
# @param name [String] program name; a name containing a directory separator
#   is checked as a path as-is instead of being searched on the PATH
# @return [true] when an executable regular file is found
# @raise [RuntimeError] "<name> command not found" otherwise
def command?(name)
  name = name.to_s
  candidates =
    if name.include?(File::SEPARATOR)
      [name]
    else
      ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).map do |dir|
        # An empty PATH entry conventionally means the current directory.
        File.join(dir.empty? ? "." : dir, name)
      end
    end

  found = candidates.any? { |path| File.file?(path) && File.executable?(path) }
  raise "#{name} command not found" unless found

  true
end
|
55
|
+
|
56
|
+
# Stop early when one of the required external programs is unavailable.
begin
  %w[exonerate blastn].each { |tool| command?(tool) }
rescue => e
  puts "ERROR: #{e}"
  exit(1)
end

# Stop early when the genome file, or the index files that the suggested
# makeblastdb command is expected to create next to it, are missing.
begin
  raise "genome file (#{genome}) not found" unless File.exist?(genome)

  index_files = ["#{genome}.nsq", "#{genome}.nos"]
  unless index_files.all? { |path| File.exist?(path) }
    raise "genome is not indexed. Please run 'makeblastdb -in #{genome} -dbtype nucl -parse_seqids'"
  end
rescue => e
  puts "ERROR: #{e}"
  exit(1)
end
|
75
|
+
|
76
|
+
### Main
|
77
|
+
|
47
78
|
Dir.mkdir(outdir)
|
48
79
|
|
49
80
|
Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
|
@@ -59,4 +90,7 @@ Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
|
|
59
90
|
res = mapper.run(query_file_path, genome)
|
60
91
|
File.open(out_file_path, "w"){|o| o.puts res}
|
61
92
|
|
93
|
+
gff3 = Mapp2g::ExonerateOutput.new(res).to_gff3()
|
94
|
+
out_file_path = "#{outdir}/#{id}.exonerate.gff3"
|
95
|
+
File.open(out_file_path, "w"){|o| o.puts gff3}
|
62
96
|
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Mapp2g

  # In-memory view of one exonerate text report.
  #
  # Parsing collects the query name, the target name, the cigar line and the
  # GFF2 "gene"/"exon" feature lines; #to_gff3 turns those feature lines into
  # GFF3 "match"/"match_part" lines.
  #
  # NOTE(review): only the last Query/Target/cigar seen is remembered, so a
  # report holding several alignments would reuse the final values for every
  # feature line - confirm the mapper emits one alignment per report.
  class ExonerateOutput

    # GFF2 source column values (est2genome / protein2genome:local) to keep.
    SOURCE_PATTERN = /\texonerate:(?:est2genome|protein2genome:local)\t/

    # GFF2 feature types that are converted.
    FEATURE_PATTERN = /\t(?:exon|gene)\t/

    # Build an instance from a file containing exonerate output.
    #
    # @param file [String] path of the exonerate output file
    # @return [ExonerateOutput]
    def self.load(file)
      self.new(File.read(file))
    end

    # @param exonerate_out [String] exonerate output text, not file path
    def initialize(exonerate_out)
      @exonerate_out = exonerate_out
      @query_name = nil
      @target = nil
      @cigar = nil
      @gff2_lines = []
      parse()
    end

    attr_reader :query_name, :target, :cigar, :gff2_lines

    # Scan the report text and fill the reader attributes.
    #
    # The collected feature lines are reset first, so calling this again does
    # not duplicate them.
    #
    # @param opt [Hash] currently unused
    def parse(opt={})
      @gff2_lines = []
      @exonerate_out.each_line do |l|
        if (m = /\s+Query:\s/.match(l))
          @query_name = m.post_match.chomp.split[0]
        elsif (m = /\s+Target:\s/.match(l))
          @target = m.post_match.split[0]
        elsif (m = /^cigar:\s/.match(l))
          @cigar = m.post_match.chomp
        elsif feature_line?(l)
          @gff2_lines << l.chomp
        end
      end
    end

    # Convert the collected GFF2 feature lines to GFF3 text.
    #
    # @param opt [Hash] currently unused
    # @return [String] GFF3 lines joined with "\n" ("" when nothing was collected)
    # @raise [RuntimeError] when a "gene" feature is present but the report
    #   had no cigar line, or when a line has an unexpected feature type
    def to_gff3(opt={})
      @gff2_lines.map { |l| convert_line(l) }.join("\n")
    end

    private

    # True when +line+ is a gene/exon feature line for the current target.
    def feature_line?(line)
      # Compare the target name literally: interpolating it into a regexp
      # breaks on names containing characters such as "(", "+" or "[".
      line.start_with?(@target.to_s) &&
        SOURCE_PATTERN.match?(line) &&
        FEATURE_PATTERN.match?(line)
    end

    # Rewrite one GFF2 feature line as a GFF3 match / match_part line.
    def convert_line(line)
      cols = line.chomp.split(/\t/)
      cols << nil while cols.size < 9 # always emit the nine GFF columns
      orig_attribute = parse_gff2_attributes(cols[8])

      attribute =
        case cols[2]
        when "gene"
          cols[2] = "match"
          match_attributes(orig_attribute)
        when "exon"
          cols[2] = "match_part"
          {'Parent' => @query_name,
           'identity' => orig_attribute['identity'],
           'similarity' => orig_attribute['similarity']}
        else
          raise "unexpected feature type #{cols[2].inspect} in exonerate GFF2 line: #{line}"
        end

      cols[8] = attribute.map { |k, v| "#{k}=#{v}" }.join(";")
      cols.join("\t")
    end

    # Attributes of the "match" feature; Target and Gap come from the cigar line.
    def match_attributes(orig_attribute)
      if @cigar.nil?
        raise "cigar line not found in exonerate output; cannot build Target/Gap attributes"
      end

      c = @cigar.split(/\s+/)
      # Everything after the first nine fields is the list of
      # <operation, length> pairs.
      cigar_pairs = c.drop(9).join.scan(/[MDI]\d+/)
      {'ID' => @query_name,
       # The query start read from the cigar line is shifted by one here.
       'Target' => [@query_name, c[1].to_i + 1, c[2]].join(" "),
       'Gap' => cigar_pairs.join(" "),
       'identity' => orig_attribute['identity'],
       'similarity' => orig_attribute['similarity']}
    end

    # Parse a GFF2 attribute column ("key value ; key value ...") into a Hash.
    # Empty entries are ignored and a value may itself contain spaces.
    def parse_gff2_attributes(column)
      column.to_s.split(";").each_with_object({}) do |entry, h|
        key, value = entry.strip.split(/\s+/, 2)
        h[key] = value if key
      end
    end

  end

end
|
75
|
+
|
data/lib/mapp2g/version.rb
CHANGED
data/lib/mapp2g.rb
CHANGED
@@ -0,0 +1,117 @@
|
|
1
|
+
#===
|
2
|
+
# add_annotation_from_uniprot_fasta_to_gff.rb
|
3
|
+
#
|
4
|
+
# This script adds annotation from UniProt FASTA file to GFF file.
|
5
|
+
#
|
6
|
+
# Usage: ruby add_annotation_from_uniprot_fasta_to_gff.rb <uniprot_proteome_reference_fasta> <gff>
|
7
|
+
#
|
8
|
+
# Example: ruby add_annotation_from_uniprot_fasta_to_gff.rb UP000001593_45351.fasta mygff.gff3 > mygff_with_annotation.gff3
|
9
|
+
#
|
10
|
+
|
11
|
+
require 'uri'
|
12
|
+
|
13
|
+
# https://www.uniprot.org/help/fasta-headers
|
14
|
+
#
|
15
|
+
# UniProtKB
|
16
|
+
#
|
17
|
+
# >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
|
18
|
+
#
|
19
|
+
# Where:
|
20
|
+
#
|
21
|
+
# db is 'sp' for UniProtKB/Swiss-Prot and 'tr' for UniProtKB/TrEMBL.
|
22
|
+
# UniqueIdentifier is the primary accession number of the UniProtKB entry.
|
23
|
+
# EntryName is the entry name of the UniProtKB entry.
|
24
|
+
# ProteinName is the recommended name of the UniProtKB entry as annotated in the RecName field. For UniProtKB/TrEMBL entries without a RecName field, the SubName field is used. In case of multiple SubNames, the first one is used. The 'precursor' attribute is excluded, 'Fragment' is included with the name if applicable.
|
25
|
+
# OrganismName is the scientific name of the organism of the UniProtKB entry.
|
26
|
+
# OrganismIdentifier is the unique identifier of the source organism, assigned by the NCBI.
|
27
|
+
# GeneName is the first gene name of the UniProtKB entry. If there is no gene name, OrderedLocusName or ORFname, the GN field is not listed.
|
28
|
+
# ProteinExistence is the numerical value describing the evidence for the existence of the protein.
|
29
|
+
# SequenceVersion is the version number of the sequence.
|
30
|
+
|
31
|
+
# >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
|
32
|
+
|
33
|
+
fastaf = ARGV[0]
|
34
|
+
gfff = ARGV[1]
|
35
|
+
data = Hash.new
|
36
|
+
|
37
|
+
|
38
|
+
# Percent-escaping helpers for GFF3 columns.
module Escape

  # ref: https://github.com/bioruby/bioruby/blob/master/lib/bio/db/gff.rb

  # unsafe characters to be escaped for normal columns
  UNSAFE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><;=,]/n

  # unsafe characters to be escaped for seqid columns
  # and target_id of the "Target" attribute
  UNSAFE_SEQID = /[^-a-zA-Z0-9.:^*$@!+_?|]/n

  # unsafe characters to be escaped for attribute columns
  UNSAFE_ATTRIBUTE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><]/n

  # Name the RFC 2396 parser explicitly: it is the class that implements
  # #escape, whereas URI::Parser is an alias whose target differs between
  # versions of the uri library.
  URI_PARSER = URI::RFC2396_Parser.new

  # Percent-escape every character of +str+ that matches UNSAFE_ATTRIBUTE
  # (for example ";", "=", "," and "%"), so the value can be placed in a
  # GFF3 attribute column.
  #
  # @param str [String] raw attribute value
  # @return [String] escaped value
  def self.escape_attribute(str)
    URI_PARSER.escape(str, UNSAFE_ATTRIBUTE)
  end

end
|
59
|
+
|
60
|
+
# Index the UniProt FASTA headers by accession (the second "|"-separated
# field of the header).
#
# Expected header layout:
#   >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
#
# File.foreach closes the file once reading is finished. Headers that do not
# follow the layout are reported on STDERR and skipped, instead of stopping
# the whole run with a NoMethodError on a failed match.
uniprot_header = /^>(\S+)\|(\S+)\|(\S+)\s+(.+)\s+OS\=(.+)\s+OX\=(\d+)\s+(GN\=(.+)\s+)?PE\=(\d+)\s+SV\=(\d+)/

File.foreach(fastaf) do |line|
  next unless line.start_with?(">")

  m = uniprot_header.match(line)
  unless m
    warn "WARNING: skipping FASTA header that is not in UniProtKB format: #{line.chomp}"
    next
  end

  data[m[2]] = {
    :db => m[1],
    :id => m[2],
    :entry_name => m[3],
    :protein_name => m[4],
    :organism_name => m[5],
    :organism_id => m[6],
    :gene_name => m[8], # nil when the header has no GN= field
    :prot_exis => m[9],
    :seq_ver => m[10]
  }
end
|
91
|
+
|
92
|
+
# Append UniProt-derived attributes (Name, Dbxref and, when a gene name is
# known, Alias) to every "match" feature whose ID is a key of +data+.
# Every other line is written back unchanged.
File.foreach(gfff) do |line|
  cols = line.chomp.split(/\t/)
  unless cols[2] == "match"
    puts line
    next
  end

  # Capture the ID value only: stop at the next ";" so that attributes
  # following the ID (e.g. "ID=x;Target=...") are not swallowed into it.
  id = cols[8].to_s[/(?:\A|;)\s*ID=([^;]+)/, 1]
  entry = id && data[id]
  unless entry
    # Nothing to add for this feature; keep the line rather than crash.
    warn "WARNING: no UniProt annotation for match #{id.inspect}; line left unchanged"
    puts line
    next
  end

  dbxref = {
    'EMBL' => entry[:id],
    'Uniprot' => entry[:entry_name],
    'tax' => entry[:organism_id]
  }
  extra = {
    'Name' => Escape.escape_attribute(entry[:protein_name]),
    'Dbxref' => dbxref.map { |k, v| "#{k}:#{Escape.escape_attribute(v)}" }.join(",")
  }
  extra['Alias'] = Escape.escape_attribute(entry[:gene_name]) if entry[:gene_name]

  cols[8] = cols[8] + ";" + extra.map { |k, v| "#{k}=#{v}" }.join(";")
  puts cols.join("\t")
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Read exonerate output from ARGF and print its GFF2 gene/cds/exon features
# as GFF3 match (gene) and match_part (cds, exon) lines. The ID / Parent
# attribute is the name taken from the most recent "Query:" line.
gff_lines = []

i = 0
name = nil
target = nil

ARGF.each do |l|

  if (m = /\s+Query:\s/.match(l))
    name = m.post_match.chomp.split[0]
    i += 1
    STDERR.puts "#{i} records processed" if i % 1000 == 0
  elsif (m = /\s+Target:\s/.match(l))
    target = m.post_match.split[0]
  elsif l.start_with?(target.to_s) && # literal prefix: the name may hold regexp metacharacters
        /\texonerate:(?:est2genome|protein2genome:local)\t/.match?(l) &&
        /\t(?:cds|exon|gene)\t/.match?(l)
    cols = l.chomp.split(/\t/)
    cols << nil while cols.size < 9 # always emit the nine GFF columns
    case cols[2]
    when "gene"
      cols[-1] = "ID=#{name}"
      cols[2] = "match"
    when "cds", "exon"
      cols[-1] = "Parent=#{name}"
      cols[2] = "match_part"
    else
      raise "unexpected feature type #{cols[2].inspect} in exonerate GFF2 line: #{l.chomp}"
    end
    gff_lines << cols.join("\t")
  end

end

puts gff_lines
|
37
|
+
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Convert the GFF2 gene/exon features of one exonerate report into GFF3
# match / match_part lines on STDOUT. The match line carries Target and Gap
# attributes built from the report's cigar line.
#
# Usage: ruby mapp2g-exonerate_gff2_to_jbgff3.v3.rb <exonerate_output.txt>

# The fallback is the sample path used when no argument is given.
exonerate_out = (ARGV[0] || "mapp2g_out_ClyHem/mapp2g_out_ClyHem_j609t87/57.exonerate.txt")

query_name = nil
target = nil
cigar = nil
gff2_lines = []
#vulgar = nil

# File.foreach closes the file once reading is finished.
File.foreach(exonerate_out) do |l|

  if (m = /\s+Query:\s/.match(l))
    query_name = m.post_match.chomp.split[0]
  elsif (m = /\s+Target:\s/.match(l))
    target = m.post_match.split[0]
  elsif (m = /^cigar:\s/.match(l))
    cigar = m.post_match.chomp
  elsif l.start_with?(target.to_s) && # literal prefix: the name may hold regexp metacharacters
        /\texonerate:(?:est2genome|protein2genome:local)\t/.match?(l) &&
        /\t(?:exon|gene)\t/.match?(l)
    gff2_lines << l.chomp
  end

end

#puts gff2_lines

# Parse a GFF2 attribute column ("key value ; key value ...") into a Hash.
# Empty entries are ignored and a value may itself contain spaces.
parse_attributes = lambda do |column|
  column.to_s.split(";").each_with_object({}) do |entry, h|
    key, value = entry.strip.split(/\s+/, 2)
    h[key] = value if key
  end
end

gff2_lines.each do |l|
  cols = l.chomp.split(/\t/)
  cols << nil while cols.size < 9 # always emit the nine GFF columns
  orig_attribute = parse_attributes.call(cols[8])

  attribute =
    case cols[2]
    when "gene"
      raise "cigar line not found in #{exonerate_out}; cannot build Target/Gap attributes" if cigar.nil?

      cols[2] = "match"
      c = cigar.split(/\s+/)
      # Everything after the first nine fields is the list of
      # <operation, length> pairs.
      cigar_pairs = c.drop(9).join.scan(/[MDI]\d+/)
      {'ID' => query_name,
       # The query start read from the cigar line is shifted by one here.
       'Target' => [query_name, c[1].to_i + 1, c[2]].join(" "),
       'Gap' => cigar_pairs.join(" "),
       'identity' => orig_attribute['identity'],
       'similarity' => orig_attribute['similarity']}
    when "exon"
      cols[2] = "match_part"
      {'Parent' => query_name,
       'identity' => orig_attribute['identity'],
       'similarity' => orig_attribute['similarity']}
    else
      raise "unexpected feature type #{cols[2].inspect} in exonerate GFF2 line: #{l}"
    end

  cols[8] = attribute.map { |k, v| "#{k}=#{v}" }.join(";")
  puts cols.join("\t")
end
|
58
|
+
|
59
|
+
|
60
|
+
# CIGAR format
|
61
|
+
# The format starts with the same 9 fields as sugar output (see above), and is followed by a series of <operation, length> pairs where operation is one of match, insert or delete, and the length describes the number of times this operation is repeated.
|
62
|
+
# 1 query_id: Query identifier
|
63
|
+
# 2 query_start: Query position at alignment start
|
64
|
+
# 3 query_end: Query position alignment end
|
65
|
+
# 4 query_strand: Strand of query matched
|
66
|
+
# 5 target_id|
|
67
|
+
# 6 target_start| the same 4 fields
|
68
|
+
# 7 target_end | for the target sequence
|
69
|
+
# 8 target_strand|
|
70
|
+
# 9 score| The raw alignment score
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mapp2g
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuji Shigenobu
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: mapp2g is a bioinformatics software, which map and align protein sequences
|
14
14
|
(amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment
|
@@ -28,9 +28,13 @@ files:
|
|
28
28
|
- exe/mapp2g
|
29
29
|
- lib/mapp2g.rb
|
30
30
|
- lib/mapp2g/mapper.rb
|
31
|
+
- lib/mapp2g/report.rb
|
31
32
|
- lib/mapp2g/version.rb
|
32
33
|
- mapp2g.gemspec
|
34
|
+
- scripts/add_annotation_from_uniprot_fasta_to_gff.rb
|
33
35
|
- scripts/mapp2g-exonerate_gff2_to_jbgff3.rb
|
36
|
+
- scripts/mapp2g-exonerate_gff2_to_jbgff3.v2.rb
|
37
|
+
- scripts/mapp2g-exonerate_gff2_to_jbgff3.v3.rb
|
34
38
|
- sig/mapp2g.rbs
|
35
39
|
homepage: https://github.com/shujishigenobu/mapp2g
|
36
40
|
licenses:
|
@@ -39,7 +43,7 @@ metadata:
|
|
39
43
|
homepage_uri: https://github.com/shujishigenobu/mapp2g
|
40
44
|
source_code_uri: https://github.com/shujishigenobu/mapp2g
|
41
45
|
changelog_uri: https://github.com/shujishigenobu/mapp2g
|
42
|
-
post_install_message:
|
46
|
+
post_install_message:
|
43
47
|
rdoc_options: []
|
44
48
|
require_paths:
|
45
49
|
- lib
|
@@ -54,8 +58,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
54
58
|
- !ruby/object:Gem::Version
|
55
59
|
version: '0'
|
56
60
|
requirements: []
|
57
|
-
rubygems_version: 3.
|
58
|
-
signing_key:
|
61
|
+
rubygems_version: 3.4.15
|
62
|
+
signing_key:
|
59
63
|
specification_version: 4
|
60
64
|
summary: mapp2g is the tool to map protein sequences to genome references in a splicing-aware
|
61
65
|
way.
|