protk 1.3.0 → 1.3.1.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/make_decoy.rb +1 -2
- data/bin/mascot_search.rb +2 -0
- data/bin/msgfplus_search.rb +1 -1
- data/bin/protxml_to_gff.rb +94 -115
- data/bin/protxml_to_psql.rb +3 -2
- data/bin/sixframe.rb +15 -8
- data/bin/swissprot_to_table.rb +120 -0
- data/lib/protk.rb +0 -1
- data/lib/protk/bio_gff3_extensions.rb +22 -0
- data/lib/protk/bio_sptr_extensions.rb +19 -4
- data/lib/protk/constants.rb +19 -11
- data/lib/protk/gffdb.rb +60 -0
- data/lib/protk/peptide.rb +158 -0
- data/lib/protk/protein.rb +72 -0
- data/lib/protk/protein_to_genome_mapper.rb +8 -0
- data/lib/protk/protxml_to_gff_tool.rb +3 -1
- data/lib/protk/search_tool.rb +3 -24
- data/lib/protk/swissprot_database.rb +8 -20
- data/lib/protk/tool.rb +36 -1
- metadata +68 -41
- data/lib/protk/protxml.rb +0 -141
data/lib/protk.rb
CHANGED
data/lib/protk/bio_gff3_extensions.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'bio'
|
2
|
+
|
3
|
+
# Extension to GFF3 records to support genomic coordinate mapping tasks
|
4
|
+
|
5
|
+
class Bio::GFF::GFF3::Record

  # Implemented below: a comparator (so records sort by start) and a length.
  #
  # TODO (listed by the original author, not implemented in this class body):
  #   - an overlap operator returning a new gff by overlapping this record with another
  #   - a function returning our coordinates relative to some other coordinate
  #     system (eg a protein)

  # Orders records by their start coordinate only.
  def <=>(otherRecord)
    start <=> otherRecord.start
  end

  # Span of the record, computed as end - start + 1.
  def length
    self.end - start + 1
  end

end
|
data/lib/protk/bio_sptr_extensions.rb
CHANGED
@@ -90,7 +90,7 @@ class Bio::SPTR < Bio::EMBLDB
|
|
90
90
|
# SwissProt Accessions
|
91
91
|
#
|
92
92
|
def accessions
  # Hands back whatever the record's `ac` reader returns, unmodified.
  ac
end
|
95
95
|
|
96
96
|
# Subcellular Location
|
@@ -132,7 +132,8 @@ class Bio::SPTR < Bio::EMBLDB
|
|
132
132
|
def domain
  # The "DOMAIN" entry of the cc data, coerced to a String (nil becomes "").
  domain_comment = cc["DOMAIN"]
  domain_comment.to_s
end
|
135
|
-
|
135
|
+
|
136
|
+
|
136
137
|
#
|
137
138
|
# Getting dr entry
|
138
139
|
#
|
@@ -152,6 +153,20 @@ class Bio::SPTR < Bio::EMBLDB
|
|
152
153
|
def ipi
  # Delegates to the shared dr-entry lookup helper using the "IPI" key.
  safely_get_drentry_for_key("IPI")
end
|
156
|
+
|
157
|
+
# Returns the first element of every "GO" entry in the dr data, or nil when
# the record has no "GO" entries.
# NOTE(review): the first element is presumably the GO accession - confirm
# against the layout BioRuby uses for DR lines.
def go_terms
  go_entries_list = dr["GO"]
  go_entries_list ? go_entries_list.map { |entry| entry[0] } : nil
end
|
165
|
+
|
166
|
+
# Returns the raw "GO" entries from the dr data (nil when there are none).
def go_entries
  dr["GO"]
end
|
169
|
+
|
155
170
|
|
156
171
|
# Intact accession number
|
157
172
|
#
|
@@ -234,8 +249,8 @@ class Bio::SPTR < Bio::EMBLDB
|
|
234
249
|
return self.seq.to_s
|
235
250
|
end
|
236
251
|
|
237
|
-
def
|
238
|
-
return self.ox
|
252
|
+
# Returns the "NCBI_TaxID" value taken from the record's ox data.
def ncbi_taxon_id
  taxonomy_xrefs = ox
  taxonomy_xrefs["NCBI_TaxID"]
end
|
240
255
|
|
241
256
|
def species_dump
|
data/lib/protk/constants.rb
CHANGED
@@ -29,16 +29,19 @@ class Constants
|
|
29
29
|
attr :info_level
|
30
30
|
attr :protk_dir
|
31
31
|
attr :data_lib_dir
|
32
|
+
attr_accessor :info_level
|
32
33
|
|
33
34
|
# Provides direct access to constants through methods of the same name
|
34
35
|
# This will be used for all constants other than paths
|
35
36
|
#
|
36
37
|
def method_missing(method)
  # Treat the missing method's name as a key into @env.
  value = @env[method.to_s]

  # An absent key is reported by throwing a message string (nothing here catches it).
  throw "#{method} is undefined" if value.nil?

  value
end
|
41
43
|
|
44
|
+
|
42
45
|
# Some constants are paths. They need to be translated into real paths before being returned
|
43
46
|
#
|
44
47
|
|
@@ -121,7 +124,7 @@ class Constants
|
|
121
124
|
# Read the global constants file and initialize our class @env variable
|
122
125
|
# Initialize loggers
|
123
126
|
#
|
124
|
-
def initialize
|
127
|
+
def initialize()
|
125
128
|
|
126
129
|
@data_lib_dir="#{File.dirname(__FILE__)}/data"
|
127
130
|
@protk_dir="#{Dir.home}/.protk"
|
@@ -170,8 +173,10 @@ class Constants
|
|
170
173
|
|
171
174
|
# puts "Path #{ENV['PATH']}"
|
172
175
|
throw "No data found in config file" unless @env!=nil
|
173
|
-
|
174
|
-
|
176
|
+
|
177
|
+
@info_level="fatal"
|
178
|
+
@info_level=default_config_yml['message_level'] unless default_config_yml['message_level'].nil?
|
179
|
+
|
175
180
|
end
|
176
181
|
|
177
182
|
|
@@ -196,15 +201,17 @@ class Constants
|
|
196
201
|
throw "Unable to create file logger at path #{self.log_file}" unless @file_logger!=nil
|
197
202
|
throw "Unable to create stdout logger " unless @stdout_logger!=nil
|
198
203
|
|
199
|
-
|
200
|
-
|
201
204
|
case @info_level
|
202
|
-
when
|
205
|
+
when /info/i
|
203
206
|
@stdout_logger.level=Logger::INFO
|
204
|
-
when
|
207
|
+
when /debug/i
|
205
208
|
@stdout_logger.level=Logger::DEBUG
|
206
|
-
when
|
207
|
-
@stdout_logger.level=Logger::WARN
|
209
|
+
when /warn/i
|
210
|
+
@stdout_logger.level=Logger::WARN
|
211
|
+
when /fatal/i
|
212
|
+
@stdout_logger.level=Logger::FATAL
|
213
|
+
else
|
214
|
+
throw "Unknown log level #{@info_level}"
|
208
215
|
end
|
209
216
|
|
210
217
|
end
|
@@ -215,8 +222,9 @@ class Constants
|
|
215
222
|
if ( @stdout_logger == nil || @file_logger == nil)
|
216
223
|
initialize_loggers
|
217
224
|
end
|
218
|
-
|
219
|
-
|
225
|
+
|
226
|
+
@stdout_logger.send(level,message)
|
227
|
+
@file_logger.send(level,message)
|
220
228
|
end
|
221
229
|
|
222
230
|
def path_for_builtin_database(dbname)
|
data/lib/protk/gffdb.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'protk/constants'
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
|
5
|
+
# In-memory index over the records of a GFF3 file.
#
# Two lookups are maintained:
#   * record id          => Array of records carrying that id
#   * 'Parent' attribute => Array of CDS records naming that parent
#
# Use GFFDB.create(path) to obtain an instance that is already indexed;
# GFFDB.new(path) alone leaves both maps empty.
class GFFDB

  # Hash mapping a record id to the Array of records with that id.
  attr_accessor :id_to_records_map

  # gff_file_path:: path of the GFF3 file this database refers to.
  def initialize(gff_file_path)
    # NOTE(review): the Constants instance itself is never used here. The call
    # is kept because constructing it reads configuration and may raise.
    Constants.new
    @database = gff_file_path
    @id_to_records_map = {}
    @id_to_cds_map = {}
  end

  # Builds a GFFDB for gff_file_path and indexes the file immediately.
  # Returns the populated GFFDB.
  def self.create(gff_file_path)
    db = GFFDB.new(gff_file_path)
    db.make_index(gff_file_path)
    db
  end

  # Returns the Array of records whose id is entry_id, or nil if there are none.
  def get_by_id(entry_id)
    @id_to_records_map[entry_id]
  end

  # Returns the Array of CDS records whose 'Parent' attribute is entry_id,
  # or nil if there are none.
  def get_cds_by_parent_id(entry_id)
    @id_to_cds_map[entry_id]
  end

  # Parses input_gff and fills both lookup maps.
  #
  # A record whose feature type or attributes cannot be inspected is reported
  # on stdout and skipped for the CDS map; it is still present in the id map.
  def make_index(input_gff)
    # The whole file is parsed while the block runs; the block form guarantees
    # the file handle is closed afterwards, even if parsing raises.
    gffdb = File.open(input_gff, "r") { |io| Bio::GFF::GFF3.new(io) }

    gffdb.records.each do |record|
      (@id_to_records_map[record.id] ||= []) << record

      begin
        next unless record.feature_type.to_s =~ /CDS/i

        parent_id = record.attributes_to_hash['Parent']
        (@id_to_cds_map[parent_id] ||= []) << record if parent_id
      rescue
        puts "Problem initializing cds map for #{record}"
      end
    end
  end

end
|
data/lib/protk/peptide.rb
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
require 'bio'
|
3
|
+
require 'protk/bio_gff3_extensions'
|
4
|
+
include LibXML
|
5
|
+
|
6
|
+
# Raised when a peptide's sequence cannot be located inside a protein sequence.
class PeptideNotInProteinError < StandardError; end
|
8
|
+
|
9
|
+
# A peptide sequence, optionally carrying a charge and a probability, together
# with the logic needed to place it on a protein and on the genome.
#
# Instances are created through Peptide.from_protxml or Peptide.from_sequence;
# Peptide.new is private.
class Peptide

  attr_accessor :sequence
  attr_accessor :protein_name
  attr_accessor :charge
  attr_accessor :nsp_adjusted_probability

  class << self

    # Builds a peptide from a node exposing the 'peptide_sequence',
    # 'nsp_adjusted_probability' and 'charge' attributes via [].
    def from_protxml(xmlnode)
      pep = new
      pep.sequence = xmlnode['peptide_sequence']
      pep.nsp_adjusted_probability = xmlnode['nsp_adjusted_probability'].to_f
      pep.charge = xmlnode['charge'].to_i
      pep
    end

    # Builds a peptide from a bare sequence. The probability is left unset (nil).
    def from_sequence(seq, charge = nil)
      pep = new
      pep.sequence = seq
      pep.charge = charge
      pep
    end

    private :new
  end

  def initialize
  end

  # Locates this peptide within prot_seq.
  #
  # Expects prot_seq not to contain an explicit stop codon (ie * at end).
  # AA coords are 0-based unlike genomic coords which are 1 based.
  #
  # prot_seq:: protein sequence to search
  # reverse::  when true, both sequences are reversed before searching, so the
  #            returned coordinates count from the end of prot_seq
  #
  # Returns {:start => i, :end => i + peptide length} (end is exclusive).
  # Raises PeptideNotInProteinError if the peptide does not occur in prot_seq.
  def coords_in_protein(prot_seq, reverse = false)
    pep_start_i = if reverse
                    prot_seq.reverse.index(sequence.reverse)
                  else
                    prot_seq.index(sequence)
                  end
    raise PeptideNotInProteinError if pep_start_i.nil?

    { :start => pep_start_i, :end => pep_start_i + sequence.length }
  end

  # Returns a list of GFF3 records, one per CDS that the peptide overlaps, in
  # GFF style (1 based) genomic coordinates.
  #
  # Assumes that cds_records is inclusive of the entire protein sequence
  # including start-met, and that the gff records conform to the spec
  #
  #   http://www.sequenceontology.org/gff3.shtml
  #
  # This part of the spec is crucial
  #
  # - The START and STOP codons are included in the CDS.
  # - That is, if the locations of the start and stop codons are known,
  # - the first three base pairs of the CDS should correspond to the start codon
  # - and the last three correspond the stop codon.
  #
  # We also assume that all the cds records provided actually form part of the
  # protein (ie skipped exons should not be included).
  #
  # prot_seq::      protein sequence the peptide belongs to
  # parent_record:: Bio::GFF::GFF3::Record whose strand decides the direction
  # cds_records::   Array of CDS records making up the protein
  #
  # Raises ArgumentError when parent_record or cds_records has the wrong class
  # and PeptideNotInProteinError when the peptide is not found in prot_seq.
  def to_gff3_records(prot_seq, parent_record, cds_records)
    unless parent_record.class == Bio::GFF::GFF3::Record
      raise ArgumentError, "Expected GFF3 Record but got #{parent_record.class}"
    end
    unless cds_records.class == Array
      raise ArgumentError, "Expected Array but got #{cds_records.class}"
    end

    on_reverse_strand = parent_record.strand == "-"
    aa_coords = coords_in_protein(prot_seq, false) # Always use forward protein coordinates

    # Walk the CDS records in translation order: ascending start on the forward
    # strand, descending on the reverse strand.
    ordered_cds_records = on_reverse_strand ? cds_records.sort.reverse : cds_records.sort

    # Peptide extent in nucleotides from the start of translation, [start, end)
    pep_start_i = aa_coords[:start] * 3
    pep_end_i = pep_start_i + sequence.length * 3

    i = 0 # Nucleotides of coding sequence consumed before the current CDS
    fragments = []
    ordered_cds_records.each do |cds_record|
      cds_len = cds_record.length

      # Intersect the peptide with this CDS, which covers [i, i + cds_len).
      overlap_start = [i, pep_start_i].max
      overlap_end = [i + cds_len, pep_end_i].min

      # A CDS lying wholly before or after the peptide has an empty
      # intersection and must not produce a fragment.
      if overlap_end > overlap_start
        offset = overlap_start - i
        fragment_len = overlap_end - overlap_start
        if on_reverse_strand
          # Translation runs from the high coordinate downwards
          fragment_end = cds_record.end - offset
          fragment_start = fragment_end - fragment_len + 1
        else
          fragment_start = cds_record.start + offset
          fragment_end = fragment_start + fragment_len - 1
        end
        fragments << gff_record_for_peptide_fragment(fragment_start, fragment_end, cds_record)
      end

      i += cds_len
    end
    fragments
  end

  # Builds the GFF3 record describing one genomic fragment of this peptide.
  #
  # start_i, end_i:: genomic coordinates of the fragment
  # parent_record::  the CDS record the fragment lies on; supplies seqid,
  #                  strand and the Parent id
  #
  # The record id is "<cds id>.<sequence>" with ".<charge>" appended when a
  # charge is set. The score column is the nsp adjusted probability, or "."
  # when that is unset. The phase column is always written as 0.
  def gff_record_for_peptide_fragment(start_i, end_i, parent_record)
    cds_id = parent_record.id
    this_id = "#{cds_id}.#{sequence}"
    this_id << ".#{charge}" unless charge.nil?
    score = nsp_adjusted_probability.nil? ? "." : nsp_adjusted_probability.to_s
    gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
    Bio::GFF::GFF3::Record.new(gff_string)
  end

end
|
data/lib/protk/protein.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'protk/peptide'
|
2
|
+
|
3
|
+
include LibXML
|
4
|
+
|
5
|
+
|
6
|
+
# A protein entry read from a protXML document, together with its peptides.
#
# Instances are created with Protein.from_protxml; Protein.new is private.
class Protein

  attr_accessor :group_number, :group_probability, :probability, :sequence,
                :protein_name, :n_indistinguishable_proteins,
                :percent_coverage, :peptides

  class << self

    # Builds a Protein from a <protein> node. Group level values are read from
    # the node's parent. The expected layout is (attributes abridged):
    #
    #   <protein_group group_number="1" probability="1.0000">
    #     <protein protein_name="ACADV_MOUSE" n_indistinguishable_proteins="1"
    #              probability="1.0000" percent_coverage="9.9" ...>
    #       <parameter name="prot_length" value="656"/>
    #       <annotation protein_description="..."/>
    #       <peptide peptide_sequence="SGELAVQALDQFATVVEAK" charge="1"
    #                nsp_adjusted_probability="0.9981" ...>
    #       </peptide>
    #       <peptide peptide_sequence="GIVNEQFLLQR" charge="1"
    #                nsp_adjusted_probability="0.9979" ...>
    #       </peptide>
    #     </protein>
    #   </protein_group>
    #
    # Each <peptide> child found under the protXML namespace becomes a Peptide.
    def from_protxml(xmlnode)
      group = xmlnode.parent

      new.tap do |prot|
        prot.group_probability = group['probability'].to_f
        prot.group_number = group['group_number'].to_i
        prot.probability = xmlnode['probability'].to_f
        prot.protein_name = xmlnode['protein_name']
        prot.n_indistinguishable_proteins = xmlnode['n_indistinguishable_proteins'].to_i
        prot.percent_coverage = xmlnode['percent_coverage'].to_f

        found = xmlnode.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
        prot.peptides = found.collect { |node| Peptide.from_protxml(node) }
      end
    end

    private :new
  end

  def initialize
  end

  # Returns one peptide per distinct sequence, keeping the peptide with the
  # highest nsp adjusted probability. On a tie the peptide seen first is kept.
  # Results are ordered by first appearance of each sequence.
  def representative_peptides
    winners = peptides.each_with_object({}) do |candidate, best|
      current = best[candidate.sequence]
      if current.nil? || candidate.nsp_adjusted_probability > current.nsp_adjusted_probability
        best[candidate.sequence] = candidate
      end
    end

    winners.values
  end

end
|