mapp2g 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 80ed37b4687cdd64f5a292593e2b15addadbabe49223c79835b530e94ac23be6
|
4
|
+
data.tar.gz: 37a038d19321a88e10f44261a3027bda85c40c29d529b710bb93ed760ada585e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 79f3e7022e8532c1bb9fc0f333927042a4eead1a40d1ecbcb9408d9391a1f21e162830132c395411aac7065076fa14eed694f6ee5b6824a3edc1c193a99ff9ab
|
7
|
+
data.tar.gz: b6c93c34a6576ea1e57fdc2eecdb04e767a595b6b6a037885ab3a89ebf46f36272b6017f55441788ca1cb89dd80cbe273d6a7049f067445b48d8464a486402b2
|
data/exe/mapp2g
CHANGED
@@ -5,6 +5,7 @@ require 'bio'
|
|
5
5
|
require 'tempfile'
|
6
6
|
require 'optparse'
|
7
7
|
|
8
|
+
### Parse options
|
8
9
|
|
9
10
|
opt = OptionParser.new
|
10
11
|
OPTS = {}
|
@@ -30,7 +31,6 @@ begin
|
|
30
31
|
|
31
32
|
opt.parse!(ARGV)
|
32
33
|
|
33
|
-
|
34
34
|
rescue => e
|
35
35
|
puts "ERROR: #{e}\nSee #{opt}"
|
36
36
|
exit
|
@@ -44,6 +44,34 @@ outdir = (OPTS[:o] || "mapp2g_out_#{$$}")
|
|
44
44
|
|
45
45
|
#p [query, genome, outdir]
|
46
46
|
|
47
|
+
### Check environment
|
48
|
+
|
49
|
+
# Ensure an external command is available before the pipeline starts.
#
# The lookup walks ENV['PATH'] in Ruby instead of shelling out to `which`,
# so +name+ is never passed through a shell and the check does not depend on
# `which` itself being installed.
#
# @param name [String] command name (e.g. "blastn") or a path containing a
#   directory separator
# @return [true] when an executable regular file for +name+ is found
# @raise [RuntimeError] "<name> command not found" when it is not
def command?(name)
  name = name.to_s
  runnable = ->(path) { File.file?(path) && File.executable?(path) }

  found =
    if name.include?(File::SEPARATOR)
      # An explicit path is checked as given, without consulting PATH.
      runnable.call(name)
    else
      # PATHEXT is normally only set on Windows; elsewhere try the bare name.
      suffixes = [''] + ENV.fetch('PATHEXT', '').split(';')
      ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).any? do |dir|
        next false if dir.empty?

        suffixes.any? { |suffix| runnable.call(File.join(dir, "#{name}#{suffix}")) }
      end
    end

  raise "#{name} command not found" unless found

  true
end
|
55
|
+
|
56
|
+
# Verify that the external tools are installed. command? raises for a
# missing tool; the first failure is reported and the program exits with
# status 1.
begin
  %w[exonerate blastn].each { |tool| command?(tool) }
rescue => err
  puts "ERROR: #{err}"
  exit(1)
end
|
63
|
+
|
64
|
+
# The genome must already have a BLAST index: both the .nsq and .nos files
# are expected next to the genome path. Otherwise print the makeblastdb
# command to run and exit with status 1.
index_files = ["#{genome}.nsq", "#{genome}.nos"]
unless index_files.all? { |path| File.exist?(path) }
  reason = "genome is not indexed. Please run 'makeblastdb -in #{genome} -dbtype nucl -parse_seqids'"
  puts "ERROR: #{reason}"
  exit(1)
end
|
72
|
+
|
73
|
+
### Main
|
74
|
+
|
47
75
|
Dir.mkdir(outdir)
|
48
76
|
|
49
77
|
Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
|
data/lib/mapp2g/version.rb
CHANGED
@@ -0,0 +1,117 @@
|
|
1
|
+
#===
|
2
|
+
# add_annotation_from_uniprot_fasta_to_gff.rb
|
3
|
+
#
|
4
|
+
# This script adds annotation from UniProt FASTA file to GFF file.
|
5
|
+
#
|
6
|
+
# Usage: ruby add_annotation_from_uniprot_fasta_to_gff.rb <uniprot_proteome_reference_fasta> <gff>
|
7
|
+
#
|
8
|
+
# Example: ruby add_annotation_from_uniprot_fasta_to_gff.rb UP000001593_45351.fasta mygff.gff3 > mygff_with_annotation.gff3
|
9
|
+
#
|
10
|
+
|
11
|
+
require 'uri'
|
12
|
+
|
13
|
+
# https://www.uniprot.org/help/fasta-headers
|
14
|
+
#
|
15
|
+
# UniProtKB
|
16
|
+
#
|
17
|
+
# >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
|
18
|
+
#
|
19
|
+
# Where:
|
20
|
+
#
|
21
|
+
# db is 'sp' for UniProtKB/Swiss-Prot and 'tr' for UniProtKB/TrEMBL.
|
22
|
+
# UniqueIdentifier is the primary accession number of the UniProtKB entry.
|
23
|
+
# EntryName is the entry name of the UniProtKB entry.
|
24
|
+
# ProteinName is the recommended name of the UniProtKB entry as annotated in the RecName field. For UniProtKB/TrEMBL entries without a RecName field, the SubName field is used. In case of multiple SubNames, the first one is used. The 'precursor' attribute is excluded, 'Fragment' is included with the name if applicable.
|
25
|
+
# OrganismName is the scientific name of the organism of the UniProtKB entry.
|
26
|
+
# OrganismIdentifier is the unique identifier of the source organism, assigned by the NCBI.
|
27
|
+
# GeneName is the first gene name of the UniProtKB entry. If there is no gene name, OrderedLocusName or ORFname, the GN field is not listed.
|
28
|
+
# ProteinExistence is the numerical value describing the evidence for the existence of the protein.
|
29
|
+
# SequenceVersion is the version number of the sequence.
|
30
|
+
|
31
|
+
# >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
|
32
|
+
|
33
|
+
# Command-line arguments: the UniProt FASTA file first, then the GFF file.
fastaf, gfff = ARGV[0], ARGV[1]
# Parsed FASTA header fields, keyed by the UniqueIdentifier field of each header.
data = {}
|
36
|
+
|
37
|
+
|
38
|
+
# Percent-encoding helpers for GFF columns.
# The character classes are taken from BioRuby's GFF implementation:
# https://github.com/bioruby/bioruby/blob/master/lib/bio/db/gff.rb
module Escape
  # Characters that must be escaped in ordinary columns
  UNSAFE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><;=,]/n

  # Characters that must be escaped in the seqid column
  # and in the target_id of the "Target" attribute
  UNSAFE_SEQID = /[^-a-zA-Z0-9.:^*$@!+_?|]/n

  # Characters that must be escaped in the attribute column
  UNSAFE_ATTRIBUTE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><]/n

  URI_PARSER = URI::Parser.new

  class << self
    # Percent-encode every character of +str+ matched by UNSAFE_ATTRIBUTE
    # (for example ';', '=' and ',') and return the escaped string.
    def escape_attribute(str)
      URI_PARSER.escape(str, UNSAFE_ATTRIBUTE)
    end
  end
end
|
59
|
+
|
60
|
+
# Parse every FASTA header line and store its fields in `data`, keyed by the
# UniqueIdentifier (second '|'-separated field of the header).
#
# File.foreach closes the file once iteration finishes (File.open(...).each
# left the handle open). Headers that do not follow the UniProtKB layout are
# reported on STDERR and skipped instead of raising NoMethodError on a nil
# match.
#
# Capture groups: 1 db, 2 id, 3 entry name, 4 protein name, 5 organism name,
# 6 organism id, 8 gene name (optional), 9 protein existence, 10 sequence version.
header_pattern = /^>(\S+)\|(\S+)\|(\S+)\s+(.+)\s+OS\=(.+)\s+OX\=(\d+)\s+(GN\=(.+)\s+)?PE\=(\d+)\s+SV\=(\d+)/

File.foreach(fastaf) do |l|
  next unless l.start_with?('>')

  m = header_pattern.match(l)
  unless m
    warn "WARNING: skipping unrecognized FASTA header: #{l.chomp}"
    next
  end

  data[m[2]] = {
    :db            => m[1],
    :id            => m[2],
    :entry_name    => m[3],
    :protein_name  => m[4],
    :organism_name => m[5],
    :organism_id   => m[6],
    :gene_name     => m[8], # nil when the header has no GN= field
    :prot_exis     => m[9],
    :seq_ver       => m[10]
  }
end
|
91
|
+
|
92
|
+
# Copy the GFF to STDOUT, appending Name / Dbxref / Alias attributes to every
# "match" feature whose ID has a record in `data`.
#
# Fixes over the previous loop:
# * File.foreach closes the file when iteration ends.
# * The ID is read up to the next ';' so attributes following it are not
#   swallowed into the lookup key.
# * A "match" line without an ID, or whose ID has no record in `data`, is
#   passed through unchanged (with a warning on STDERR) instead of raising
#   NoMethodError on nil.
File.foreach(gfff) do |l|
  a = l.chomp.split(/\t/)
  col_type = a[2]
  col_attr = a[8]

  id = col_attr[/ID\=([^;]+)/, 1] if col_type == "match" && col_attr
  d = data[id] if id

  unless d
    warn "WARNING: no annotation found for match feature: #{l.chomp}" if col_type == "match"
    puts l
    next
  end

  dbxref = {
    'EMBL'    => d[:id],
    'Uniprot' => d[:entry_name],
    'tax'     => d[:organism_id]
  }

  h = {
    'Name'   => Escape.escape_attribute(d[:protein_name]),
    'Dbxref' => dbxref.map { |k, v| "#{k}:#{Escape.escape_attribute(v)}" }.join(",")
  }
  # Alias is only emitted when the FASTA header carried a GN= field.
  h['Alias'] = Escape.escape_attribute(d[:gene_name]) if d[:gene_name]

  a[8] = col_attr + ";" + h.map { |k, v| "#{k}=#{v}" }.join(";")
  puts a.join("\t")
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Convert the GFF2 lines found in exonerate output (read from ARGF) into
# match / match_part records:
#   gene       -> match       with ID=<query name>
#   cds, exon  -> match_part  with Parent=<query name>
# Only lines whose source column is exonerate:est2genome or
# exonerate:protein2genome:local are converted. All converted lines are
# printed at the end.
#
# The target is compared with the first GFF column as a plain string. The
# previous /^#{target}/ interpolated the sequence name into a regex, which
# raised RegexpError or matched the wrong lines for names containing regex
# metacharacters, and matched every line while target was still nil.
gff_lines = []

i = 0
name = nil
target = nil

sources = ["exonerate:est2genome", "exonerate:protein2genome:local"]

# GFF2 feature type => [output feature type, attribute key]
feature_map = {
  "gene" => ["match", "ID"],
  "cds"  => ["match_part", "Parent"],
  "exon" => ["match_part", "Parent"]
}

ARGF.each do |l|
  if (m = /\s+Query:\s/.match(l))
    name = m.post_match.chomp.split[0]
    i += 1
    STDERR.puts "#{i} records processed" if i % 1000 == 0
  elsif (m = /\s+Target:\s/.match(l))
    # NOTE(review): exonerate appears to append ":[revcomp]" to the target id
    # for reverse-strand hits; strip it so it matches the GFF seqname. Confirm
    # against real output.
    target = m.post_match.split[0].to_s.sub(/:\[revcomp\]\z/, "")
  elsif target
    cols = l.chomp.split(/\t/)
    next unless cols[0] == target && sources.include?(cols[1]) && feature_map.key?(cols[2])

    type, key = feature_map[cols[2]]
    # Pad to nine columns so the attribute always lands in the last one.
    cols.concat([nil] * (9 - cols.size)) if cols.size < 9
    cols[2] = type
    cols[-1] = "#{key}=#{name}"
    gff_lines << cols.join("\t")
  end
end

puts gff_lines
|
37
|
+
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Convert the GFF2 lines of an exonerate report into match / match_part
# records carrying ID/Target/Gap (gene) or Parent (exon) attributes plus the
# identity and similarity values from the GFF2 attribute column.
#
# Fixes over the previous version:
# * GFF lines are grouped per alignment (a new group starts at each "Query:"
#   line) and converted with that alignment's own query name and cigar. The
#   old code used the last query name and cigar in the file for every line.
# * The target is compared with the first GFF column as a plain string
#   instead of being interpolated into a regex.
# * A gene line without a cigar raises a descriptive error, and attribute
#   segments that are not "key value" pairs are ignored instead of breaking
#   to_h.

exonerate_out = (ARGV[0] || "mapp2g_out_ClyHem/mapp2g_out_ClyHem_j609t87/57.exonerate.txt")

# One alignment report: query name, its cigar string (text after "cigar: ")
# and the split GFF2 lines that belong to it.
ExonerateHit = Struct.new(:query_name, :cigar, :gff2_lines)

gff2_sources = ["exonerate:est2genome", "exonerate:protein2genome:local"]
gff2_features = %w[exon gene]

# Turn "key value ; key value" into a Hash; malformed segments are dropped.
parse_gff2_attributes = lambda do |column|
  column.to_s.split(";")
        .map { |pair| pair.strip.split(/\s+/, 2) }
        .select { |pair| pair.size == 2 }
        .to_h
end

# Start with a placeholder so lines seen before any "Query:" have a home.
hits = [ExonerateHit.new(nil, nil, [])]
target = nil

File.foreach(exonerate_out) do |l|
  if (m = /\s+Query:\s/.match(l))
    hits << ExonerateHit.new(m.post_match.chomp.split[0], nil, [])
  elsif (m = /\s+Target:\s/.match(l))
    # NOTE(review): exonerate appears to append ":[revcomp]" to the target id
    # for reverse-strand hits; strip it so it matches the GFF seqname. Confirm
    # against real output.
    target = m.post_match.split[0].to_s.sub(/:\[revcomp\]\z/, "")
  elsif (m = /^cigar:\s/.match(l))
    hits.last.cigar = m.post_match.chomp
  elsif target
    cols = l.chomp.split(/\t/)
    if cols[0] == target && gff2_sources.include?(cols[1]) && gff2_features.include?(cols[2])
      hits.last.gff2_lines << cols
    end
  end
end

hits.each do |hit|
  hit.gff2_lines.each do |cols|
    b = cols + [nil] * [9 - cols.size, 0].max
    orig_attribute = parse_gff2_attributes.call(b[8])

    if b[2] == "gene"
      unless hit.cigar
        raise "no cigar line found for #{hit.query_name}; run exonerate with --showcigar"
      end

      b[2] = "match"
      # Cigar fields: 0 query_id, 1 query_start, 2 query_end, ... 8 score,
      # 9.. alternating operation / length tokens.
      c = hit.cigar.split(/\s+/)
      cigar_pairs = c[9..-1].to_a.join.scan(/[MDI]\d+/)
      attribute = {
        'ID'         => hit.query_name,
        # query_start is written as start + 1
        'Target'     => [hit.query_name, c[1].to_i + 1, c[2]].join(" "),
        'Gap'        => cigar_pairs.join(" "),
        'identity'   => orig_attribute['identity'],
        'similarity' => orig_attribute['similarity']
      }
    else
      b[2] = "match_part"
      attribute = {
        'Parent'     => hit.query_name,
        'identity'   => orig_attribute['identity'],
        'similarity' => orig_attribute['similarity']
      }
    end

    b[8] = attribute.map { |k, v| "#{k}=#{v}" }.join(";")
    puts b.join("\t")
  end
end
|
58
|
+
|
59
|
+
|
60
|
+
# CIGAR format
|
61
|
+
# The format starts with the same 9 fields as sugar output (see above), and is followed by a series of <operation, length> pairs where operation is one of match, insert or delete, and the length describes the number of times this operation is repeated.
|
62
|
+
# 1 query_id: Query identifier
|
63
|
+
# 2 query_start: Query position at alignment start
|
64
|
+
# 3 query_end: Query position alignment end
|
65
|
+
# 4 query_strand: Strand of query matched
|
66
|
+
# 5 target_id|
|
67
|
+
# 6 target_start| the same 4 fields
|
68
|
+
# 7 target_end | for the target sequence
|
69
|
+
# 8 target_strand|
|
70
|
+
# 9 score| The raw alignment score
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mapp2g
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuji Shigenobu
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-09 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: mapp2g is a bioinformatics software, which map and align protein sequences
|
14
14
|
(amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment
|
@@ -30,7 +30,10 @@ files:
|
|
30
30
|
- lib/mapp2g/mapper.rb
|
31
31
|
- lib/mapp2g/version.rb
|
32
32
|
- mapp2g.gemspec
|
33
|
+
- scripts/add_annotation_from_uniprot_fasta_to_gff.rb
|
33
34
|
- scripts/mapp2g-exonerate_gff2_to_jbgff3.rb
|
35
|
+
- scripts/mapp2g-exonerate_gff2_to_jbgff3.v2.rb
|
36
|
+
- scripts/mapp2g-exonerate_gff2_to_jbgff3.v3.rb
|
34
37
|
- sig/mapp2g.rbs
|
35
38
|
homepage: https://github.com/shujishigenobu/mapp2g
|
36
39
|
licenses:
|
@@ -39,7 +42,7 @@ metadata:
|
|
39
42
|
homepage_uri: https://github.com/shujishigenobu/mapp2g
|
40
43
|
source_code_uri: https://github.com/shujishigenobu/mapp2g
|
41
44
|
changelog_uri: https://github.com/shujishigenobu/mapp2g
|
42
|
-
post_install_message:
|
45
|
+
post_install_message:
|
43
46
|
rdoc_options: []
|
44
47
|
require_paths:
|
45
48
|
- lib
|
@@ -54,8 +57,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
54
57
|
- !ruby/object:Gem::Version
|
55
58
|
version: '0'
|
56
59
|
requirements: []
|
57
|
-
rubygems_version: 3.
|
58
|
-
signing_key:
|
60
|
+
rubygems_version: 3.4.10
|
61
|
+
signing_key:
|
59
62
|
specification_version: 4
|
60
63
|
summary: mapp2g is the tool to map protein sequences to genome references in a splicing-aware
|
61
64
|
way.
|