mapp2g 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 80ed37b4687cdd64f5a292593e2b15addadbabe49223c79835b530e94ac23be6
|
4
|
+
data.tar.gz: 37a038d19321a88e10f44261a3027bda85c40c29d529b710bb93ed760ada585e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 79f3e7022e8532c1bb9fc0f333927042a4eead1a40d1ecbcb9408d9391a1f21e162830132c395411aac7065076fa14eed694f6ee5b6824a3edc1c193a99ff9ab
|
7
|
+
data.tar.gz: b6c93c34a6576ea1e57fdc2eecdb04e767a595b6b6a037885ab3a89ebf46f36272b6017f55441788ca1cb89dd80cbe273d6a7049f067445b48d8464a486402b2
|
data/exe/mapp2g
CHANGED
@@ -5,6 +5,7 @@ require 'bio'
|
|
5
5
|
require 'tempfile'
|
6
6
|
require 'optparse'
|
7
7
|
|
8
|
+
### Parse options
|
8
9
|
|
9
10
|
opt = OptionParser.new
|
10
11
|
OPTS = {}
|
@@ -30,7 +31,6 @@ begin
|
|
30
31
|
|
31
32
|
opt.parse!(ARGV)
|
32
33
|
|
33
|
-
|
34
34
|
rescue => e
|
35
35
|
puts "ERROR: #{e}\nSee #{opt}"
|
36
36
|
exit
|
@@ -44,6 +44,34 @@ outdir = (OPTS[:o] || "mapp2g_out_#{$$}")
|
|
44
44
|
|
45
45
|
#p [query, genome, outdir]
|
46
46
|
|
47
|
+
### Check environment
|
48
|
+
|
49
|
+
# Ensure that an external command is available before the pipeline starts.
#
# The lookup walks the directories listed in ENV["PATH"] in Ruby instead of
# interpolating +name+ into a `which` shell command, so it neither depends on
# `which` being installed nor lets shell metacharacters in +name+ be executed.
#
# @param name [String] command name, or a path containing a directory separator
# @return [true] when an executable regular file is found
# @raise [RuntimeError] "<name> command not found" when nothing is found
def command?(name)
  name = name.to_s
  candidates =
    if name.include?(File::SEPARATOR)
      # A path was given: test it directly, as `which` does.
      [name]
    else
      ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).map do |dir|
        # An empty PATH entry conventionally means the current directory.
        File.join(dir.empty? ? "." : dir, name)
      end
    end

  found = candidates.any? { |path| File.file?(path) && File.executable?(path) }
  raise "#{name} command not found" unless found

  true
end
|
55
|
+
|
56
|
+
# Both external aligners must be callable; report the first one that is
# missing and stop with exit status 1.
%w[exonerate blastn].each do |tool|
  begin
    command?(tool)
  rescue => err
    puts "ERROR: #{err}"
    exit(1)
  end
end

# The genome must already have BLAST index files (.nsq and .nos) next to it;
# otherwise tell the user how to build them and stop with exit status 1.
index_files = ["#{genome}.nsq", "#{genome}.nos"]
unless index_files.all? { |path| File.exist?(path) }
  puts "ERROR: genome is not indexed. Please run 'makeblastdb -in #{genome} -dbtype nucl -parse_seqids'"
  exit(1)
end
|
72
|
+
|
73
|
+
### Main
|
74
|
+
|
47
75
|
Dir.mkdir(outdir)
|
48
76
|
|
49
77
|
Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
|
data/lib/mapp2g/version.rb
CHANGED
@@ -0,0 +1,117 @@
|
|
1
|
+
#===
|
2
|
+
# add_annotation_from_uniprot_fasta_to_gff.rb
|
3
|
+
#
|
4
|
+
# This script adds annotation from UniProt FASTA file to GFF file.
|
5
|
+
#
|
6
|
+
# Usage: ruby add_annotation_from_uniprot_fasta_to_gff.rb <uniprot_proteome_reference_fasta> <gff>
|
7
|
+
#
|
8
|
+
# Example: ruby add_annotation_from_uniprot_fasta_to_gff.rb UP000001593_45351.fasta mygff.gff3 > mygff_with_annotation.gff3
|
9
|
+
#
|
10
|
+
|
11
|
+
require 'uri'
|
12
|
+
|
13
|
+
# https://www.uniprot.org/help/fasta-headers
|
14
|
+
#
|
15
|
+
# UniProtKB
|
16
|
+
#
|
17
|
+
# >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
|
18
|
+
#
|
19
|
+
# Where:
|
20
|
+
#
|
21
|
+
# db is 'sp' for UniProtKB/Swiss-Prot and 'tr' for UniProtKB/TrEMBL.
|
22
|
+
# UniqueIdentifier is the primary accession number of the UniProtKB entry.
|
23
|
+
# EntryName is the entry name of the UniProtKB entry.
|
24
|
+
# ProteinName is the recommended name of the UniProtKB entry as annotated in the RecName field. For UniProtKB/TrEMBL entries without a RecName field, the SubName field is used. In case of multiple SubNames, the first one is used. The 'precursor' attribute is excluded, 'Fragment' is included with the name if applicable.
|
25
|
+
# OrganismName is the scientific name of the organism of the UniProtKB entry.
|
26
|
+
# OrganismIdentifier is the unique identifier of the source organism, assigned by the NCBI.
|
27
|
+
# GeneName is the first gene name of the UniProtKB entry. If there is no gene name, OrderedLocusName or ORFname, the GN field is not listed.
|
28
|
+
# ProteinExistence is the numerical value describing the evidence for the existence of the protein.
|
29
|
+
# SequenceVersion is the version number of the sequence.
|
30
|
+
|
31
|
+
# >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
|
32
|
+
|
33
|
+
# Command-line arguments: the UniProt FASTA path first, the GFF path second.
fastaf, gfff = ARGV[0], ARGV[1]
# Parsed FASTA header fields, filled in below and keyed by UniProt accession.
data = {}
|
36
|
+
|
37
|
+
|
38
|
+
# Percent-escaping helpers for GFF columns.
module Escape

  # ref: https://github.com/bioruby/bioruby/blob/master/lib/bio/db/gff.rb

  # unsafe characters to be escaped for normal columns
  UNSAFE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><;=,]/n

  # unsafe characters to be escaped for seqid columns
  # and target_id of the "Target" attribute
  UNSAFE_SEQID = /[^-a-zA-Z0-9.:^*$@!+_?|]/n

  # unsafe characters to be escaped for attribute columns
  UNSAFE_ATTRIBUTE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><]/n

  # Name the RFC 2396 parser explicitly: it is the class that implements
  # #escape with a custom "unsafe" pattern, whereas URI::Parser is only an
  # alias whose target depends on the Ruby/uri version.
  URI_PARSER = URI::RFC2396_Parser.new

  # Percent-escape every character of +str+ that is not allowed in a GFF
  # attribute value (for example ";", "=", ",", "%", tab).
  #
  # The patterns above are byte-oriented (/n), so escaping is done on a binary
  # copy and the original encoding is restored afterwards; bytes 0x80-0xfd are
  # in the allowed set and pass through untouched, so non-ASCII text survives.
  #
  # @param str [String, #to_s] raw attribute value
  # @return [String] escaped value, tagged with the encoding of the input
  def self.escape_attribute(str)
    text = str.to_s
    URI_PARSER.escape(text.b, UNSAFE_ATTRIBUTE).force_encoding(text.encoding)
  end

end
|
59
|
+
|
60
|
+
# UniProtKB FASTA header layout:
#   >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
UNIPROT_HEADER_RE = /^>(\S+)\|(\S+)\|(\S+)\s+(.+)\s+OS\=(.+)\s+OX\=(\d+)\s+(GN\=(.+)\s+)?PE\=(\d+)\s+SV\=(\d+)/

# Split one FASTA header line into its UniProt fields.
#
# @param line [String] a line starting with ">"
# @return [Hash, nil] the parsed fields (:gene_name is nil when the header has
#   no GN= field), or nil when the line is not in UniProtKB header format
def parse_uniprot_header(line)
  m = UNIPROT_HEADER_RE.match(line)
  return nil unless m

  {
    :db            => m[1],
    :id            => m[2],
    :entry_name    => m[3],
    :protein_name  => m[4],
    :organism_name => m[5],
    :organism_id   => m[6],
    :gene_name     => m[8], # m[7] is the whole optional "GN=... " group
    :prot_exis     => m[9],
    :seq_ver       => m[10]
  }
end

# Index every header of the FASTA file by accession. File.foreach closes the
# file when done. Headers that do not follow the UniProtKB layout are reported
# on STDERR and skipped instead of aborting the run with a nil error.
File.foreach(fastaf) do |l|
  next unless l.start_with?(">")

  h = parse_uniprot_header(l)
  if h
    data[h[:id]] = h
  else
    STDERR.puts "WARNING: skipped FASTA header that is not in UniProtKB format: #{l.chomp}"
  end
end
|
91
|
+
|
92
|
+
# Copy the GFF to STDOUT, appending Name / Dbxref / Alias attributes to every
# "match" feature whose ID is a key of +data+. All other lines, and match
# lines whose ID cannot be resolved, are written back unchanged.
File.foreach(gfff) do |l|
  a = l.chomp.split(/\t/)
  col_type = a[2]
  col_attr = a[8]

  unless col_type == "match"
    puts l
    next
  end

  # Take only the ID value itself: stop at the next ";" so that attributes
  # following the ID (e.g. "ID=x;Target=...") are not swallowed into it.
  id = col_attr && col_attr[/(?:\A|;)\s*ID=([^;]+)/, 1]
  d = id && data[id]
  unless d
    STDERR.puts "WARNING: no UniProt entry found for match #{id.inspect}; line left unchanged"
    puts l
    next
  end

  dbxref = {
    'EMBL'    => d[:id],
    'Uniprot' => d[:entry_name],
    'tax'     => d[:organism_id]
  }
  h = {
    'Name'   => Escape.escape_attribute(d[:protein_name]),
    'Dbxref' => dbxref.map { |k, v| "#{k}:#{Escape.escape_attribute(v)}" }.join(",")
  }
  # Alias is only emitted when the FASTA header carried a GN= field.
  h['Alias'] = Escape.escape_attribute(d[:gene_name]) if d[:gene_name]

  b = a.dup
  b[8] = col_attr + ";" + h.map { |k, v| "#{k}=#{v}" }.join(";")
  puts b.join("\t")
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Convert the GFF2 lines embedded in exonerate output (read from ARGF) into
# GFF3-style match / match_part lines on STDOUT:
#   gene       -> match       with attribute ID=<query name>
#   cds, exon  -> match_part  with attribute Parent=<query name>
# Only lines whose source column is one of the exonerate models below are kept.

exonerate_sources = ["exonerate:est2genome", "exonerate:protein2genome:local"]

# GFF2 feature type => [GFF3 feature type, attribute key that links to the query]
feature_map = {
  "gene" => ["match", "ID"],
  "cds"  => ["match_part", "Parent"],
  "exon" => ["match_part", "Parent"]
}

i = 0
name = nil
target = nil

ARGF.each do |l|
  if m = /\s+Query:\s/.match(l)
    name = m.post_match.chomp.split[0]
    i += 1
    STDERR.puts "#{i} records processed" if i % 1000 == 0
  elsif m = /\s+Target:\s/.match(l)
    target = m.post_match.split[0]
    # NOTE(review): exonerate appears to print reverse-strand targets as
    # "<id>:[revcomp]" while the GFF seqid is the bare id - confirm against
    # real output. Stripping the suffix is a no-op when it is absent.
    target = target.sub(/:\[revcomp\]\z/, "") if target
  elsif target && l.start_with?(target)
    # Literal prefix comparison: sequence ids routinely contain characters
    # such as "|" or "." that must not be interpreted as a regular expression,
    # and nothing is matched before the first Target line has been seen.
    a = l.chomp.split(/\t/)
    next unless exonerate_sources.include?(a[1])

    type, key = feature_map[a[2]]
    next unless type

    # Pad to the nine GFF columns so the attribute always lands in the last one.
    b = a + Array.new([9 - a.size, 0].max)
    b[2] = type
    b[-1] = "#{key}=#{name}"
    puts b.join("\t")
  end
end
|
37
|
+
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Convert the GFF2 lines of an exonerate report into GFF3-style lines on STDOUT:
#   gene -> match       (ID, Target, Gap, identity, similarity)
#   exon -> match_part  (Parent, identity, similarity)
# Target and Gap are derived from the "cigar:" line of the same alignment, so
# exonerate has to be run with --showcigar.
#
# Usage: ruby <this script> <exonerate output file>
# Without an argument the hard-coded sample path below is read (kept for
# backward compatibility).

exonerate_out = (ARGV[0] || "mapp2g_out_ClyHem/mapp2g_out_ClyHem_j609t87/57.exonerate.txt")

# Parse a GFF2 attribute column ("key value ; key value ; ...") into a Hash.
# Only the first run of whitespace separates key from value, so values that
# contain spaces are kept whole; a bare key maps to nil.
def parse_gff2_attributes(column)
  column.to_s.split(";").each_with_object({}) do |field, attrs|
    key, value = field.strip.split(/\s+/, 2)
    attrs[key] = value if key
  end
end

exonerate_sources = ["exonerate:est2genome", "exonerate:protein2genome:local"]
feature_types = { "gene" => "match", "exon" => "match_part" }

target = nil
# State of the alignment currently being read. Each collected GFF line keeps a
# reference to the alignment it belongs to, so a report holding several
# alignments no longer labels every record with the last query name / cigar.
current = { :query_name => nil, :cigar => nil }
records = []

File.foreach(exonerate_out) do |l|
  if m = /\s+Query:\s/.match(l)
    # A "Query:" line opens a new alignment.
    current = { :query_name => m.post_match.chomp.split[0], :cigar => nil }
  elsif m = /\s+Target:\s/.match(l)
    target = m.post_match.split[0]
    # NOTE(review): exonerate appears to print reverse-strand targets as
    # "<id>:[revcomp]" while the GFF seqid is the bare id - confirm against
    # real output. Stripping the suffix is a no-op when it is absent.
    target = target.sub(/:\[revcomp\]\z/, "") if target
  elsif m = /^cigar:\s/.match(l)
    current[:cigar] = m.post_match.chomp
  elsif target && l.start_with?(target)
    # Literal prefix comparison: ids containing "|", "." or "[" must not be
    # interpreted as a regular expression.
    cols = l.chomp.split(/\t/)
    next unless exonerate_sources.include?(cols[1]) && feature_types.key?(cols[2])

    records << [cols, current]
  end
end

records.each do |cols, aln|
  # Pad to the nine GFF columns.
  b = cols + Array.new([9 - cols.size, 0].max)
  orig_attribute = parse_gff2_attributes(b[8])
  query_name = aln[:query_name]

  if b[2] == "gene"
    cigar = aln[:cigar]
    unless cigar
      raise "no cigar line found for query #{query_name.inspect}; run exonerate with --showcigar"
    end

    # cigar fields: query_id, query_start, query_end, query_strand, target_id,
    # target_start, target_end, target_strand, score, then <operation, length> pairs.
    c = cigar.split(/\s+/)
    cigar_pairs = (c[9..-1] || []).join.scan(/[MDI]\d+/)
    attribute = {
      'ID'         => query_name,
      # query_start + 1 as in the original; presumably converts exonerate's
      # 0-based start into a 1-based coordinate - TODO confirm.
      'Target'     => [query_name, c[1].to_i + 1, c[2]].join(" "),
      'Gap'        => cigar_pairs.join(" "),
      'identity'   => orig_attribute['identity'],
      'similarity' => orig_attribute['similarity']
    }
  else
    # Only "gene" and "exon" records are collected above, so this is an exon.
    attribute = {
      'Parent'     => query_name,
      'identity'   => orig_attribute['identity'],
      'similarity' => orig_attribute['similarity']
    }
  end

  b[2] = feature_types[b[2]]
  b[8] = attribute.map { |k, v| "#{k}=#{v}" }.join(";")
  puts b.join("\t")
end
|
58
|
+
|
59
|
+
|
60
|
+
# CIGAR format
|
61
|
+
# The format starts with the same 9 fields as sugar output (see above), and is followed by a series of <operation, length> pairs where operation is one of match, insert or delete, and the length describes the number of times this operation is repeated.
|
62
|
+
# 1 query_id: Query identifier
|
63
|
+
# 2 query_start: Query position at alignment start
|
64
|
+
# 3 query_end: Query position alignment end
|
65
|
+
# 4 query_strand: Strand of query matched
|
66
|
+
# 5 target_id|
|
67
|
+
# 6 target_start| the same 4 fields
|
68
|
+
# 7 target_end | for the target sequence
|
69
|
+
# 8 target_strand|
|
70
|
+
# 9 score| The raw alignment score
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mapp2g
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuji Shigenobu
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-09 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: mapp2g is a bioinformatics software, which map and align protein sequences
|
14
14
|
(amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment
|
@@ -30,7 +30,10 @@ files:
|
|
30
30
|
- lib/mapp2g/mapper.rb
|
31
31
|
- lib/mapp2g/version.rb
|
32
32
|
- mapp2g.gemspec
|
33
|
+
- scripts/add_annotation_from_uniprot_fasta_to_gff.rb
|
33
34
|
- scripts/mapp2g-exonerate_gff2_to_jbgff3.rb
|
35
|
+
- scripts/mapp2g-exonerate_gff2_to_jbgff3.v2.rb
|
36
|
+
- scripts/mapp2g-exonerate_gff2_to_jbgff3.v3.rb
|
34
37
|
- sig/mapp2g.rbs
|
35
38
|
homepage: https://github.com/shujishigenobu/mapp2g
|
36
39
|
licenses:
|
@@ -39,7 +42,7 @@ metadata:
|
|
39
42
|
homepage_uri: https://github.com/shujishigenobu/mapp2g
|
40
43
|
source_code_uri: https://github.com/shujishigenobu/mapp2g
|
41
44
|
changelog_uri: https://github.com/shujishigenobu/mapp2g
|
42
|
-
post_install_message:
|
45
|
+
post_install_message:
|
43
46
|
rdoc_options: []
|
44
47
|
require_paths:
|
45
48
|
- lib
|
@@ -54,8 +57,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
54
57
|
- !ruby/object:Gem::Version
|
55
58
|
version: '0'
|
56
59
|
requirements: []
|
57
|
-
rubygems_version: 3.
|
58
|
-
signing_key:
|
60
|
+
rubygems_version: 3.4.10
|
61
|
+
signing_key:
|
59
62
|
specification_version: 4
|
60
63
|
summary: mapp2g is the tool to map protein sequences to genome references in a splicing-aware
|
61
64
|
way.
|