protk 1.3.0 → 1.3.1.pre2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/make_decoy.rb +1 -2
- data/bin/mascot_search.rb +2 -0
- data/bin/msgfplus_search.rb +1 -1
- data/bin/protxml_to_gff.rb +94 -115
- data/bin/protxml_to_psql.rb +3 -2
- data/bin/sixframe.rb +15 -8
- data/bin/swissprot_to_table.rb +120 -0
- data/lib/protk.rb +0 -1
- data/lib/protk/bio_gff3_extensions.rb +22 -0
- data/lib/protk/bio_sptr_extensions.rb +19 -4
- data/lib/protk/constants.rb +19 -11
- data/lib/protk/gffdb.rb +60 -0
- data/lib/protk/peptide.rb +158 -0
- data/lib/protk/protein.rb +72 -0
- data/lib/protk/protein_to_genome_mapper.rb +8 -0
- data/lib/protk/protxml_to_gff_tool.rb +3 -1
- data/lib/protk/search_tool.rb +3 -24
- data/lib/protk/swissprot_database.rb +8 -20
- data/lib/protk/tool.rb +36 -1
- metadata +68 -41
- data/lib/protk/protxml.rb +0 -141
data/lib/protk.rb
CHANGED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'bio'
|
2
|
+
|
3
|
+
# Extension to GFF3 records to support genomic coordinate mapping tasks.
#
# The original notes list three capabilities for this extension:
#   - a comparator to allow sorting by start
#   - an overlap operator to return a new gff by overlapping this one with another
#   - a function to return our coordinates relative to some other coordinate
#     system (eg a protein)
# Only the comparator and a length helper are defined in this block.
class Bio::GFF::GFF3::Record

  # Orders records by their start coordinate only, which is enough for
  # Array#sort to arrange records by position.
  def <=>(other)
    start <=> other.start
  end

  # Number of positions spanned by the record, counting both the start and
  # the end coordinate.
  def length
    self.end - start + 1
  end

end
|
@@ -90,7 +90,7 @@ class Bio::SPTR < Bio::EMBLDB
|
|
90
90
|
# SwissProt Accessions
|
91
91
|
#
|
92
92
|
def accessions
|
93
|
-
return
|
93
|
+
return self.ac
|
94
94
|
end
|
95
95
|
|
96
96
|
# Subcellular Location
|
@@ -132,7 +132,8 @@ class Bio::SPTR < Bio::EMBLDB
|
|
132
132
|
# Returns the "DOMAIN" entry of the comment (cc) data as a String.
# Because the value is passed through to_s, a missing entry (nil) comes
# back as an empty string.
def domain
  cc["DOMAIN"].to_s
end
|
135
|
-
|
135
|
+
|
136
|
+
|
136
137
|
#
|
137
138
|
# Getting dr entry
|
138
139
|
#
|
@@ -152,6 +153,20 @@ class Bio::SPTR < Bio::EMBLDB
|
|
152
153
|
# Returns whatever safely_get_drentry_for_key yields for the "IPI"
# database cross-reference key.
def ipi
  safely_get_drentry_for_key("IPI")
end
|
156
|
+
|
157
|
+
# Returns the first field of every "GO" cross-reference (dr) entry
# (presumably the GO identifier -- confirm against the dr entry layout),
# or nil when the record has no "GO" entries.
def go_terms
  go_entries_list = dr["GO"]
  go_entries_list ? go_entries_list.map { |entry| entry[0] } : nil
end
|
165
|
+
|
166
|
+
# Returns the raw "GO" cross-reference (dr) entries exactly as stored,
# or nil when the record has none.
def go_entries
  dr["GO"]
end
|
169
|
+
|
155
170
|
|
156
171
|
# Intact accession number
|
157
172
|
#
|
@@ -234,8 +249,8 @@ class Bio::SPTR < Bio::EMBLDB
|
|
234
249
|
return self.seq.to_s
|
235
250
|
end
|
236
251
|
|
237
|
-
def
|
238
|
-
return self.ox
|
252
|
+
def ncbi_taxon_id
|
253
|
+
return self.ox["NCBI_TaxID"]
|
239
254
|
end
|
240
255
|
|
241
256
|
def species_dump
|
data/lib/protk/constants.rb
CHANGED
@@ -29,16 +29,19 @@ class Constants
|
|
29
29
|
attr :info_level
|
30
30
|
attr :protk_dir
|
31
31
|
attr :data_lib_dir
|
32
|
+
attr_accessor :info_level
|
32
33
|
|
33
34
|
# Provides direct access to constants through methods of the same name.
# This will be used for all constants other than paths.
#
# The missing method's name is used (as a String) to look up a value in
# @env. A nil lookup result is reported with `throw`; as no matching
# `catch` is set up here, Ruby surfaces that as an uncaught-throw error.
#
def method_missing(method)
  value = @env[method.to_s]
  throw "#{method} is undefined" if value == nil
  value
end
|
41
43
|
|
44
|
+
|
42
45
|
# Some constants are paths. They need to be translated into real paths before being returned
|
43
46
|
#
|
44
47
|
|
@@ -121,7 +124,7 @@ class Constants
|
|
121
124
|
# Read the global constants file and initialize our class @env variable
|
122
125
|
# Initialize loggers
|
123
126
|
#
|
124
|
-
def initialize
|
127
|
+
def initialize()
|
125
128
|
|
126
129
|
@data_lib_dir="#{File.dirname(__FILE__)}/data"
|
127
130
|
@protk_dir="#{Dir.home}/.protk"
|
@@ -170,8 +173,10 @@ class Constants
|
|
170
173
|
|
171
174
|
# puts "Path #{ENV['PATH']}"
|
172
175
|
throw "No data found in config file" unless @env!=nil
|
173
|
-
|
174
|
-
|
176
|
+
|
177
|
+
@info_level="fatal"
|
178
|
+
@info_level=default_config_yml['message_level'] unless default_config_yml['message_level'].nil?
|
179
|
+
|
175
180
|
end
|
176
181
|
|
177
182
|
|
@@ -196,15 +201,17 @@ class Constants
|
|
196
201
|
throw "Unable to create file logger at path #{self.log_file}" unless @file_logger!=nil
|
197
202
|
throw "Unable to create stdout logger " unless @stdout_logger!=nil
|
198
203
|
|
199
|
-
|
200
|
-
|
201
204
|
case @info_level
|
202
|
-
when
|
205
|
+
when /info/i
|
203
206
|
@stdout_logger.level=Logger::INFO
|
204
|
-
when
|
207
|
+
when /debug/i
|
205
208
|
@stdout_logger.level=Logger::DEBUG
|
206
|
-
when
|
207
|
-
@stdout_logger.level=Logger::WARN
|
209
|
+
when /warn/i
|
210
|
+
@stdout_logger.level=Logger::WARN
|
211
|
+
when /fatal/i
|
212
|
+
@stdout_logger.level=Logger::FATAL
|
213
|
+
else
|
214
|
+
throw "Unknown log level #{@info_level}"
|
208
215
|
end
|
209
216
|
|
210
217
|
end
|
@@ -215,8 +222,9 @@ class Constants
|
|
215
222
|
if ( @stdout_logger == nil || @file_logger == nil)
|
216
223
|
initialize_loggers
|
217
224
|
end
|
218
|
-
|
219
|
-
|
225
|
+
|
226
|
+
@stdout_logger.send(level,message)
|
227
|
+
@file_logger.send(level,message)
|
220
228
|
end
|
221
229
|
|
222
230
|
def path_for_builtin_database(dbname)
|
data/lib/protk/gffdb.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'protk/constants'
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
|
5
|
+
# In-memory index over the records of a GFF3 file.
#
# Two lookups are maintained:
#   * every record, keyed by its own id (see #get_by_id)
#   * CDS records, keyed by the value of their 'Parent' attribute
#     (see #get_cds_by_parent_id)
#
# Use GFFDB.create to build a fully indexed database in one step.
class GFFDB

  # Hash mapping a record id to the Array of records that carry that id
  attr_accessor :id_to_records_map

  # Prepares an empty database for +gff_file_path+. Nothing is read from
  # disk here; the maps are filled by #make_index.
  def initialize(gff_file_path)
    # The resulting object was never used by this class. It is still
    # constructed so that whatever Constants.new does stays unchanged.
    # NOTE(review): confirm whether this call is needed at all.
    Constants.new
    @database = gff_file_path
    @id_to_records_map = {}
    @id_to_cds_map = {}
  end

  # Builds a GFFDB for +gff_file_path+ and indexes the file immediately.
  # Returns the populated database.
  def self.create(gff_file_path)
    db = GFFDB.new(gff_file_path)
    db.make_index(gff_file_path)
    db
  end

  # Returns the Array of records whose id is +entry_id+, or nil if no such
  # record was indexed.
  def get_by_id(entry_id)
    @id_to_records_map[entry_id]
  end

  # Returns the Array of CDS records whose 'Parent' attribute is +entry_id+,
  # or nil if there are none.
  def get_cds_by_parent_id(entry_id)
    @id_to_cds_map[entry_id]
  end

  # Parses the GFF3 file at +input_gff+ and fills both lookup maps.
  #
  # A failure while examining a single record for the CDS map is reported on
  # stdout and does not stop indexing of the remaining records.
  def make_index(input_gff)
    # Bio::GFF::GFF3.new parses the entire db, so the handle can be released
    # as soon as it returns. The block form guarantees the file is closed
    # even if parsing raises.
    gffdb = File.open(input_gff, "r") { |io| Bio::GFF::GFF3.new(io) }

    # Now create the mapping from ids to records
    gffdb.records.each do |record|
      (@id_to_records_map[record.id] ||= []) << record

      begin
        if record.feature_type.to_s =~ /CDS/i
          parent_id = record.attributes_to_hash['Parent']
          (@id_to_cds_map[parent_id] ||= []) << record if parent_id
        end
      rescue
        puts "Problem initializing cds map for #{record}"
      end
    end
  end

end
|
@@ -0,0 +1,158 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
require 'bio'
|
3
|
+
require 'protk/bio_gff3_extensions'
|
4
|
+
include LibXML
|
5
|
+
|
6
|
+
# Raised when a peptide's sequence cannot be found within a protein sequence
class PeptideNotInProteinError < StandardError
end

# A peptide, built either from a protXML peptide node or directly from an
# amino acid sequence, that can be located within a protein and projected
# onto genomic coordinates as GFF3 records.
#
# Instances are created through Peptide.from_protxml or
# Peptide.from_sequence; Peptide.new is private.
class Peptide

  attr_accessor :sequence
  attr_accessor :protein_name
  attr_accessor :charge
  attr_accessor :nsp_adjusted_probability

  class << self
    # Builds a peptide from a protXML peptide node, reading its
    # 'peptide_sequence', 'nsp_adjusted_probability' (as Float) and
    # 'charge' (as Integer) attributes.
    def from_protxml(xmlnode)
      pep = new()
      pep.sequence = xmlnode['peptide_sequence']
      pep.nsp_adjusted_probability = xmlnode['nsp_adjusted_probability'].to_f
      pep.charge = xmlnode['charge'].to_i
      pep
    end

    # Builds a peptide from a bare sequence and an optional charge.
    # No probability is assigned.
    def from_sequence(seq, charge = nil)
      pep = new()
      pep.sequence = seq
      pep.charge = charge
      pep
    end

    private :new
  end

  def initialize()
  end

  # Locates this peptide within +prot_seq+.
  #
  # Expects prot_seq not to contain explicit stop codon (ie * at end)
  # AA coords are 0-based unlike genomic coords which are 1 based
  #
  # With +reverse+ set, both the protein and the peptide are reversed before
  # searching, so the returned coordinates count from the end of the protein.
  #
  # Returns a Hash with :start (index of the first residue) and :end
  # (:start plus the peptide length, i.e. exclusive).
  # Raises PeptideNotInProteinError when the peptide does not occur.
  def coords_in_protein(prot_seq, reverse = false)
    haystack, needle = reverse ? [prot_seq.reverse, sequence.reverse] : [prot_seq, sequence]
    pep_start_i = haystack.index(needle)
    raise PeptideNotInProteinError if pep_start_i.nil?

    { :start => pep_start_i, :end => pep_start_i + sequence.length }
  end

  # Returns a list of GFF3 records (one per CDS touched by the peptide) in
  # GFF style (1 based) genomic coordinates
  #
  # Assumes that cds_coords is inclusive of the entire protein sequence including start-met
  #
  # We assume that gff records conform to the spec
  #
  # http://www.sequenceontology.org/gff3.shtml
  #
  # This part of the spec is crucial
  #
  # - The START and STOP codons are included in the CDS.
  # - That is, if the locations of the start and stop codons are known,
  # - the first three base pairs of the CDS should correspond to the start codon
  # - and the last three correspond the stop codon.
  #
  # We also assume that all the cds records provided, actually form part of the protein (ie skipped exons should not be included)
  #
  # prot_seq      - protein sequence in which the peptide is located
  # parent_record - GFF3 record whose strand decides the direction of translation
  # cds_records   - Array of GFF3 CDS records making up the protein
  #
  # Raises ArgumentError when parent_record is not a GFF3 record or
  # cds_records is not an Array, and PeptideNotInProteinError when the
  # peptide does not occur in prot_seq.
  def to_gff3_records(prot_seq, parent_record, cds_records)
    unless parent_record.is_a?(Bio::GFF::GFF3::Record)
      raise ArgumentError, "Expected GFF3 Record but got #{parent_record.class}"
    end
    raise ArgumentError, "Expected Array but got #{cds_records.class}" unless cds_records.is_a?(Array)

    on_reverse_strand = parent_record.strand == "-"
    aa_coords = coords_in_protein(prot_seq, false) # Always use forward protein coordinates

    # Walk the CDS records in order of translation
    ordered_cds_records = cds_records.sort
    ordered_cds_records.reverse! if on_reverse_strand

    # Peptide extent measured in nucleic acids from the start of translation
    pep_start_i = aa_coords[:start] * 3
    pep_end_i = pep_start_i + sequence.length * 3

    fragments = []
    cds_start_i = 0 # Translated position (in nucleic acids) where the current CDS begins
    ordered_cds_records.each do |cds_record|
      cds_end_i = cds_start_i + cds_record.length

      # Intersect the half-open ranges [cds_start_i, cds_end_i) and
      # [pep_start_i, pep_end_i). A CDS lying wholly before or after the
      # peptide gives a non-positive length and must not produce a record.
      overlap_start_i = [cds_start_i, pep_start_i].max
      overlap_end_i = [cds_end_i, pep_end_i].min
      fragment_len = overlap_end_i - overlap_start_i

      if fragment_len > 0
        offset = overlap_start_i - cds_start_i
        if on_reverse_strand
          # Translation runs from the high genomic coordinate downwards
          fragment_end = cds_record.end - offset
          fragment_start = fragment_end - fragment_len + 1
        else
          fragment_start = cds_record.start + offset
          fragment_end = fragment_start + fragment_len - 1
        end
        fragments << gff_record_for_peptide_fragment(fragment_start, fragment_end, cds_record)
      end

      cds_start_i = cds_end_i
    end
    fragments
  end

  # Builds a GFF3 'polypeptide' record spanning start_i..end_i on the same
  # sequence and strand as +parent_record+.
  #
  # The record id is the parent id followed by the peptide sequence and,
  # when known, the charge. The score column holds the nsp adjusted
  # probability, or "." when none is set. The phase column is always 0.
  def gff_record_for_peptide_fragment(start_i, end_i, parent_record)
    cds_id = parent_record.id
    this_id = "#{cds_id}.#{self.sequence}"
    this_id << ".#{self.charge}" unless self.charge.nil?
    score = self.nsp_adjusted_probability.nil? ? "." : self.nsp_adjusted_probability.to_s
    gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
    Bio::GFF::GFF3::Record.new(gff_string)
  end

end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'protk/peptide'
|
2
|
+
|
3
|
+
include LibXML
|
4
|
+
|
5
|
+
|
6
|
+
# A protein entry read from a protXML document, together with the peptides
# reported for it.
#
# Instances are created through Protein.from_protxml; Protein.new is private.
class Protein

  attr_accessor :group_number
  attr_accessor :group_probability
  attr_accessor :probability
  attr_accessor :sequence
  attr_accessor :protein_name
  attr_accessor :n_indistinguishable_proteins
  attr_accessor :percent_coverage
  attr_accessor :peptides

  class << self

    # Builds a Protein from a protXML <protein> node. Group level values are
    # taken from the node's parent (<protein_group>) and one Peptide is
    # created per <peptide> child. The expected layout is (abbreviated):
    #
    # <protein_group group_number="1" probability="1.0000">
    #   <protein protein_name="ACADV_MOUSE" n_indistinguishable_proteins="1" probability="1.0000" percent_coverage="9.9" ...>
    #     <parameter name="prot_length" value="656"/>
    #     <annotation protein_description="Very long-chain specific acyl-CoA dehydrogenase, mitochondrial OS=Mus musculus GN=Acadvl PE=1 SV=3"/>
    #     <peptide peptide_sequence="SGELAVQALDQFATVVEAK" charge="1" initial_probability="0.9919" nsp_adjusted_probability="0.9981" ...>
    #     </peptide>
    #     <peptide peptide_sequence="GIVNEQFLLQR" charge="1" initial_probability="0.9909" nsp_adjusted_probability="0.9979" ...>
    #     </peptide>
    #   </protein>
    # </protein_group>
    #
    def from_protxml(xmlnode)
      protein = new
      group = xmlnode.parent

      # Values shared by every protein in the group
      protein.group_probability = group['probability'].to_f
      protein.group_number = group['group_number'].to_i

      # Values specific to this protein
      protein.probability = xmlnode['probability'].to_f
      protein.protein_name = xmlnode['protein_name']
      protein.n_indistinguishable_proteins = xmlnode['n_indistinguishable_proteins'].to_i
      protein.percent_coverage = xmlnode['percent_coverage'].to_f

      protein.peptides = xmlnode.find(
        'protxml:peptide',
        'protxml:http://regis-web.systemsbiology.net/protXML'
      ).collect { |peptide_node| Peptide.from_protxml(peptide_node) }

      protein
    end

    private :new
  end

  def initialize
  end

  # Return just one peptide for each unique sequence choosing the peptide with highest probability
  #
  # When several peptides with the same sequence share the highest
  # probability, the one encountered first is kept. Results are ordered by
  # first appearance of each sequence.
  def representative_peptides
    winners = peptides.each_with_object({}) do |candidate, best_by_sequence|
      incumbent = best_by_sequence[candidate.sequence]
      if incumbent.nil? || candidate.nsp_adjusted_probability > incumbent.nsp_adjusted_probability
        best_by_sequence[candidate.sequence] = candidate
      end
    end

    winners.values
  end

end
|