protk 1.3.0 → 1.3.1.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/protk.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require 'protk/tool.rb'
2
2
  require 'protk/swissprot_database.rb'
3
3
  require 'protk/search_tool.rb'
4
- require 'protk/protxml.rb'
5
4
  require 'protk/prophet_tool.rb'
6
5
  require 'protk/omssa_util.rb'
7
6
  require 'protk/mascot_util.rb'
@@ -0,0 +1,22 @@
1
+ require 'bio'
2
+
3
+ # Extension to GFF3 records to support genomic coordinate mapping tasks
4
+
5
class Bio::GFF::GFF3::Record
  # Planned additions noted by the original author but not implemented here:
  #   - an overlap operator returning a new gff record from the overlap of
  #     this record with another
  #   - a function returning our coordinates relative to some other
  #     coordinate system (eg a protein)

  # Comparator to allow sorting by start.
  # Records are ordered purely by their start coordinate.
  def <=>(otherRecord)
    start <=> otherRecord.start
  end

  # Number of positions covered by this record; start and end are both
  # counted (inclusive coordinates).
  def length
    self.end - start + 1
  end
end
@@ -90,7 +90,7 @@ class Bio::SPTR < Bio::EMBLDB
90
90
  # SwissProt Accessions
91
91
  #
92
92
  def accessions
93
- return ""
93
+ return self.ac
94
94
  end
95
95
 
96
96
  # Subcellular Location
@@ -132,7 +132,8 @@ class Bio::SPTR < Bio::EMBLDB
132
132
  def domain
133
133
  return self.cc["DOMAIN"].to_s
134
134
  end
135
-
135
+
136
+
136
137
  #
137
138
  # Getting dr entry
138
139
  #
@@ -152,6 +153,20 @@ class Bio::SPTR < Bio::EMBLDB
152
153
  def ipi
153
154
  return self.safely_get_drentry_for_key("IPI")
154
155
  end
156
+
157
# Collects the first element of every "GO" entry in the dr table.
#
# Returns an Array with one item per GO entry, or nil when the record has
# no "GO" entries.
def go_terms
  go_rows = dr["GO"]
  go_rows.map { |row| row[0] } if go_rows
end
165
+
166
# Returns the raw "GO" entries from the dr table (nil when there are none).
def go_entries
  dr["GO"]
end
169
+
155
170
 
156
171
  # Intact accession number
157
172
  #
@@ -234,8 +249,8 @@ class Bio::SPTR < Bio::EMBLDB
234
249
  return self.seq.to_s
235
250
  end
236
251
 
237
- def tax_dump
238
- return self.ox.to_s
252
# Looks up the "NCBI_TaxID" value in the record's ox table and returns it
# as stored there.
def ncbi_taxon_id
  ox["NCBI_TaxID"]
end
240
255
 
241
256
  def species_dump
@@ -29,16 +29,19 @@ class Constants
29
29
  attr :info_level
30
30
  attr :protk_dir
31
31
  attr :data_lib_dir
32
+ attr_accessor :info_level
32
33
 
33
34
  # Provides direct access to constants through methods of the same name
34
35
  # This will be used for all constants other than paths
35
36
  #
36
37
  def method_missing(method)
38
+
37
39
  from_env = @env[method.to_s]
38
40
  throw "#{method} is undefined" unless from_env!=nil
39
41
  from_env
40
42
  end
41
43
 
44
+
42
45
  # Some constants are paths. They need to be translated into real paths before being returned
43
46
  #
44
47
 
@@ -121,7 +124,7 @@ class Constants
121
124
  # Read the global constants file and initialize our class @env variable
122
125
  # Initialize loggers
123
126
  #
124
- def initialize
127
+ def initialize()
125
128
 
126
129
  @data_lib_dir="#{File.dirname(__FILE__)}/data"
127
130
  @protk_dir="#{Dir.home}/.protk"
@@ -170,8 +173,10 @@ class Constants
170
173
 
171
174
  # puts "Path #{ENV['PATH']}"
172
175
  throw "No data found in config file" unless @env!=nil
173
- @info_level=default_config_yml['message_level']
174
-
176
+
177
+ @info_level="fatal"
178
+ @info_level=default_config_yml['message_level'] unless default_config_yml['message_level'].nil?
179
+
175
180
  end
176
181
 
177
182
 
@@ -196,15 +201,17 @@ class Constants
196
201
  throw "Unable to create file logger at path #{self.log_file}" unless @file_logger!=nil
197
202
  throw "Unable to create stdout logger " unless @stdout_logger!=nil
198
203
 
199
-
200
-
201
204
  case @info_level
202
- when "info"
205
+ when /info/i
203
206
  @stdout_logger.level=Logger::INFO
204
- when "debug"
207
+ when /debug/i
205
208
  @stdout_logger.level=Logger::DEBUG
206
- when "warn"
207
- @stdout_logger.level=Logger::WARN
209
+ when /warn/i
210
+ @stdout_logger.level=Logger::WARN
211
+ when /fatal/i
212
+ @stdout_logger.level=Logger::FATAL
213
+ else
214
+ throw "Unknown log level #{@info_level}"
208
215
  end
209
216
 
210
217
  end
@@ -215,8 +222,9 @@ class Constants
215
222
  if ( @stdout_logger == nil || @file_logger == nil)
216
223
  initialize_loggers
217
224
  end
218
- @stdout_logger.send(level,message)
219
- @file_logger.send(level,message)
225
+
226
+ @stdout_logger.send(level,message)
227
+ @file_logger.send(level,message)
220
228
  end
221
229
 
222
230
  def path_for_builtin_database(dbname)
@@ -0,0 +1,60 @@
1
+ require 'protk/constants'
2
+ require 'bio'
3
+
4
+
5
# In-memory index over the records of a GFF3 file.
#
# Two lookups are maintained:
#   * record id        -> Array of records carrying that id
#   * 'Parent' id      -> Array of CDS records naming that parent
class GFFDB

  attr_accessor :id_to_records_map

  # gff_file_path - path of the GFF3 file this database refers to.
  #
  # Only sets up empty indexes; call make_index (or use GFFDB.create) to
  # populate them.
  def initialize(gff_file_path)
    # The original code created a Constants instance here and never used it.
    # NOTE(review): the call is kept in case its initialization side effects
    # are relied upon - confirm and remove if not.
    Constants.new
    @database = gff_file_path
    @id_to_records_map = {}
    @id_to_cds_map = {}
  end

  # Builds a GFFDB for the given file and indexes it immediately.
  #
  # Returns the populated GFFDB.
  def self.create(gff_file_path)
    db = GFFDB.new(gff_file_path)
    db.make_index(gff_file_path)
    db
  end

  # Returns the Array of records with the given id, or nil if none were indexed.
  def get_by_id(entry_id)
    @id_to_records_map[entry_id]
  end

  # Returns the Array of CDS records whose 'Parent' attribute equals entry_id,
  # or nil if none were indexed.
  def get_cds_by_parent_id(entry_id)
    @id_to_cds_map[entry_id]
  end

  # Parses input_gff and fills both indexes.
  #
  # input_gff - path of the GFF3 file to read.
  def make_index(input_gff)
    # Block form of File.open guarantees the handle is closed once parsing is
    # done (the original left it open). The GFF3 constructor parses the entire
    # db, so nothing reads from the handle afterwards.
    gffdb = File.open(input_gff, "r") { |io| Bio::GFF::GFF3.new(io) }

    # Now create the mapping from ids to records
    gffdb.records.each do |record|
      (@id_to_records_map[record.id] ||= []) << record

      begin
        next unless record.feature_type.to_s =~ /CDS/i

        parent_id = record.attributes_to_hash['Parent']
        (@id_to_cds_map[parent_id] ||= []) << record if parent_id
      rescue StandardError
        # Best effort: a record that cannot be inspected is reported and skipped.
        puts "Problem initializing cds map for #{record}"
      end
    end
  end

end
@@ -0,0 +1,158 @@
1
+ require 'libxml'
2
+ require 'bio'
3
+ require 'protk/bio_gff3_extensions'
4
+ include LibXML
5
+
6
# Raised when a peptide sequence cannot be found inside a protein sequence.
class PeptideNotInProteinError < StandardError
end

# A peptide, with helpers to locate it in a protein and to project it onto
# genomic (GFF3) coordinates.
#
# Instances are created through Peptide.from_protxml or Peptide.from_sequence;
# Peptide.new is private.
class Peptide

  attr_accessor :sequence
  attr_accessor :protein_name
  attr_accessor :charge
  attr_accessor :nsp_adjusted_probability

  class << self
    # Builds a peptide from a node that answers to [] with the attribute names
    # 'peptide_sequence', 'nsp_adjusted_probability' and 'charge'.
    # Probability is converted with to_f and charge with to_i.
    def from_protxml(xmlnode)
      pep = new
      pep.sequence = xmlnode['peptide_sequence']
      pep.nsp_adjusted_probability = xmlnode['nsp_adjusted_probability'].to_f
      pep.charge = xmlnode['charge'].to_i
      pep
    end

    # Builds a peptide from a bare sequence and an optional charge.
    # nsp_adjusted_probability is left nil.
    def from_sequence(seq, charge = nil)
      pep = new
      pep.sequence = seq
      pep.charge = charge
      pep
    end

    private :new
  end

  def initialize
  end

  # Locates this peptide within prot_seq.
  #
  # Expects prot_seq not to contain explicit stop codon (ie * at end)
  # AA coords are 0-based unlike genomic coords which are 1 based
  #
  # prot_seq - the protein sequence (String).
  # reverse  - when true, both protein and peptide are reversed before
  #            searching, so the index is counted from the protein's end.
  #
  # Returns a Hash with :start (index of the first match) and :end
  # (:start plus the peptide length).
  # Raises PeptideNotInProteinError when the peptide does not occur.
  def coords_in_protein(prot_seq, reverse = false)
    pep_start_i =
      if reverse
        prot_seq.reverse.index(sequence.reverse)
      else
        prot_seq.index(sequence)
      end
    raise PeptideNotInProteinError if pep_start_i.nil?

    { :start => pep_start_i, :end => pep_start_i + sequence.length }
  end

  # Returns a list of GFF3 records, one per CDS that the peptide touches,
  # in GFF style (1 based) genomic coordinates
  #
  # Assumes that cds_records is inclusive of the entire protein sequence including start-met
  #
  # We assume that gff records conform to the spec
  #
  # http://www.sequenceontology.org/gff3.shtml
  #
  # This part of the spec is crucial
  #
  # - The START and STOP codons are included in the CDS.
  # - That is, if the locations of the start and stop codons are known,
  # - the first three base pairs of the CDS should correspond to the start codon
  # - and the last three correspond the stop codon.
  #
  # We also assume that all the cds records provided, actually form part of the protein (ie skipped exons should not be included)
  #
  # prot_seq      - protein sequence the peptide is located in.
  # parent_record - GFF3 record whose strand decides the walking direction.
  # cds_records   - Array of CDS records; each must respond to start, end,
  #                 length, id, seqid, strand and be sortable.
  #
  # Raises ArgumentError when the arguments are of the wrong kind (the original
  # used `throw`, which surfaced as UncaughtThrowError, an ArgumentError
  # subclass) and PeptideNotInProteinError when the peptide is not in prot_seq.
  def to_gff3_records(prot_seq, parent_record, cds_records)
    unless parent_record.is_a?(Bio::GFF::GFF3::Record)
      raise ArgumentError, "Expected GFF3 Record but got #{parent_record.class}"
    end
    raise ArgumentError, "Expected Array but got #{cds_records.class}" unless cds_records.is_a?(Array)

    on_reverse_strand = parent_record.strand == "-"
    aa_coords = coords_in_protein(prot_seq, false) # Always use forward protein coordinates

    # Walk the CDS records in translation order.
    ordered_cds_records = on_reverse_strand ? cds_records.sort.reverse : cds_records.sort

    # Peptide extent measured in nucleic acids from the start of translation.
    pep_start_i = aa_coords[:start] * 3
    pep_end_i = pep_start_i + sequence.length * 3

    i = 0 # Current protein position (in nucleic acids) at the start of the CDS
    fragments = []
    ordered_cds_records.each do |cds_record|
      cds_len = cds_record.length

      # Nucleic acids of this CDS that come before the peptide begins.
      before_len = [pep_start_i - i, 0].max
      # Part of the peptide that fits in what is left of this CDS. This is
      # zero or negative when the CDS lies entirely before or after the
      # peptide; such CDS records must not produce a fragment (the original
      # emitted records with end < start for CDS preceding the peptide).
      fragment_len = [cds_len - before_len, pep_end_i - i - before_len].min

      if fragment_len > 0
        if on_reverse_strand
          # Translation runs from the CDS end towards its start.
          fragment_end = cds_record.end - before_len
          fragment_start = fragment_end - fragment_len + 1
        else
          fragment_start = cds_record.start + before_len
          fragment_end = fragment_start + fragment_len - 1
        end
        fragments << gff_record_for_peptide_fragment(fragment_start, fragment_end, cds_record)
      end

      i += cds_len
    end
    fragments
  end

  # Builds the GFF3 record describing one fragment of this peptide.
  #
  # start_i, end_i - genomic coordinates of the fragment.
  # parent_record  - the CDS record the fragment lies on; supplies seqid,
  #                  strand and the Parent id.
  #
  # The record id is "<cds id>.<sequence>" with ".<charge>" appended when a
  # charge is set; the score is the nsp adjusted probability or "." if unset.
  def gff_record_for_peptide_fragment(start_i, end_i, parent_record)
    cds_id = parent_record.id
    this_id = "#{cds_id}.#{sequence}"
    this_id << ".#{charge}" unless charge.nil?
    score = nsp_adjusted_probability.nil? ? "." : nsp_adjusted_probability.to_s
    gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
    Bio::GFF::GFF3::Record.new(gff_string)
  end

end
@@ -0,0 +1,72 @@
1
+ require 'protk/peptide'
2
+
3
+ include LibXML
4
+
5
+
6
# A protein entry read from a protXML document, together with its peptides.
#
# Instances are created through Protein.from_protxml; Protein.new is private.
class Protein

  attr_accessor :group_number
  attr_accessor :group_probability
  attr_accessor :probability
  attr_accessor :sequence
  attr_accessor :protein_name
  attr_accessor :n_indistinguishable_proteins
  attr_accessor :percent_coverage
  attr_accessor :peptides

  class << self

    # Example of the protXML structure read by from_protxml:
    #
    # <protein_group group_number="1" probability="1.0000">
    # <protein protein_name="ACADV_MOUSE" n_indistinguishable_proteins="1" probability="1.0000" percent_coverage="9.9" unique_stripped_peptides="ELGAFGLQVPSELGGLGLSNTQYAR+GIVNEQFLLQR+SGELAVQALDQFATVVEAK+VAVNILNNGR" group_sibling_id="a" total_number_peptides="4" pct_spectrum_ids="0.41" confidence="1.00">
    # <parameter name="prot_length" value="656"/>
    # <annotation protein_description="Very long-chain specific acyl-CoA dehydrogenase, mitochondrial OS=Mus musculus GN=Acadvl PE=1 SV=3"/>
    # <peptide peptide_sequence="SGELAVQALDQFATVVEAK" charge="1" initial_probability="0.9919" nsp_adjusted_probability="0.9981" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.34" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.99" is_contributing_evidence="Y" calc_neutral_pep_mass="1975.0340">
    # </peptide>
    # <peptide peptide_sequence="GIVNEQFLLQR" charge="1" initial_probability="0.9909" nsp_adjusted_probability="0.9979" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.34" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.99" is_contributing_evidence="Y" calc_neutral_pep_mass="1315.7250">
    # </peptide>
    # <peptide peptide_sequence="ELGAFGLQVPSELGGLGLSNTQYAR" charge="1" initial_probability="0.7792" nsp_adjusted_probability="0.9391" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.55" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.78" is_contributing_evidence="Y" calc_neutral_pep_mass="2576.3234">
    # </peptide>
    # <peptide peptide_sequence="VAVNILNNGR" charge="1" initial_probability="0.5674" nsp_adjusted_probability="0.8515" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.76" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.57" is_contributing_evidence="Y" calc_neutral_pep_mass="1068.6030">
    # </peptide>
    # </protein>
    # </protein_group>

    # Builds a Protein from a <protein> node.
    #
    # Group level values (probability, group_number) are read from the node's
    # parent; the remaining attributes come from the node itself. Every
    # <peptide> child found in the protXML namespace becomes a Peptide.
    def from_protxml(xmlnode)
      prot = new
      groupnode = xmlnode.parent
      prot.group_probability = groupnode['probability'].to_f
      prot.group_number = groupnode['group_number'].to_i
      prot.probability = xmlnode['probability'].to_f
      prot.protein_name = xmlnode['protein_name']
      prot.n_indistinguishable_proteins = xmlnode['n_indistinguishable_proteins'].to_i
      prot.percent_coverage = xmlnode['percent_coverage'].to_f

      peptide_nodes = xmlnode.find('protxml:peptide', 'protxml:http://regis-web.systemsbiology.net/protXML')
      prot.peptides = peptide_nodes.map { |e| Peptide.from_protxml(e) }
      prot
    end

    private :new
  end

  def initialize
  end

  # Return just one peptide for each unique sequence choosing the peptide with highest probability
  #
  # On equal probabilities the peptide seen first is kept. Results are ordered
  # by the first appearance of each sequence.
  #
  # Returns an empty Array when no peptides have been assigned. A peptide
  # without an nsp_adjusted_probability ranks below any peptide that has one
  # (the original raised NoMethodError in both situations).
  def representative_peptides
    best_peptides = {}
    (peptides || []).each do |peptide|
      current = best_peptides[peptide.sequence]
      if current.nil? || probability_or_floor(peptide) > probability_or_floor(current)
        best_peptides[peptide.sequence] = peptide
      end
    end

    best_peptides.values
  end

  private

  # Probability used for ranking; a missing value sorts below every number.
  def probability_or_floor(peptide)
    peptide.nsp_adjusted_probability || -Float::INFINITY
  end

end