protk 1.3.0 → 1.3.1.pre2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/protk.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require 'protk/tool.rb'
2
2
  require 'protk/swissprot_database.rb'
3
3
  require 'protk/search_tool.rb'
4
- require 'protk/protxml.rb'
5
4
  require 'protk/prophet_tool.rb'
6
5
  require 'protk/omssa_util.rb'
7
6
  require 'protk/mascot_util.rb'
@@ -0,0 +1,22 @@
1
+ require 'bio'
2
+
3
# Extension to GFF3 records to support genomic coordinate mapping tasks.
#
# Two helpers are defined here: an ordering on the start coordinate (so that
# collections of records can be sorted) and an inclusive length.
class Bio::GFF::GFF3::Record

  # Comparator to allow sorting by start.
  #
  # otherRecord:: another record responding to +start+
  # Returns whatever <=> yields for the two start coordinates.
  def <=>(otherRecord)
    start <=> otherRecord.start
  end

  # Number of positions spanned by the record, counting both the start and
  # the end coordinate.
  def length
    self.end - start + 1
  end

end
@@ -90,7 +90,7 @@ class Bio::SPTR < Bio::EMBLDB
90
90
  # SwissProt Accessions
91
91
  #
92
92
  def accessions
93
- return ""
93
+ return self.ac
94
94
  end
95
95
 
96
96
  # Subcellular Location
@@ -132,7 +132,8 @@ class Bio::SPTR < Bio::EMBLDB
132
132
  def domain
133
133
  return self.cc["DOMAIN"].to_s
134
134
  end
135
-
135
+
136
+
136
137
  #
137
138
  # Getting dr entry
138
139
  #
@@ -152,6 +153,20 @@ class Bio::SPTR < Bio::EMBLDB
152
153
  def ipi
153
154
  return self.safely_get_drentry_for_key("IPI")
154
155
  end
156
+
157
# First element of every "GO" dr entry, or nil when the record has no "GO"
# entries at all.
def go_terms
  go_entries_for_record = dr["GO"]
  go_entries_for_record ? go_entries_for_record.map { |entry| entry[0] } : nil
end
165
+
166
# The raw "GO" entries from this record's dr data (nil when there are none).
def go_entries
  dr["GO"]
end
169
+
155
170
 
156
171
  # Intact accession number
157
172
  #
@@ -234,8 +249,8 @@ class Bio::SPTR < Bio::EMBLDB
234
249
  return self.seq.to_s
235
250
  end
236
251
 
237
- def tax_dump
238
- return self.ox.to_s
252
+ def ncbi_taxon_id
253
+ return self.ox["NCBI_TaxID"]
239
254
  end
240
255
 
241
256
  def species_dump
@@ -29,16 +29,19 @@ class Constants
29
29
  attr :info_level
30
30
  attr :protk_dir
31
31
  attr :data_lib_dir
32
+ attr_accessor :info_level
32
33
 
33
34
  # Provides direct access to constants through methods of the same name
34
35
  # This will be used for all constants other than paths
35
36
  #
36
37
  def method_missing(method)
38
+
37
39
  from_env = @env[method.to_s]
38
40
  throw "#{method} is undefined" unless from_env!=nil
39
41
  from_env
40
42
  end
41
43
 
44
+
42
45
  # Some constants are paths. They need to be translated into real paths before being returned
43
46
  #
44
47
 
@@ -121,7 +124,7 @@ class Constants
121
124
  # Read the global constants file and initialize our class @env variable
122
125
  # Initialize loggers
123
126
  #
124
- def initialize
127
+ def initialize()
125
128
 
126
129
  @data_lib_dir="#{File.dirname(__FILE__)}/data"
127
130
  @protk_dir="#{Dir.home}/.protk"
@@ -170,8 +173,10 @@ class Constants
170
173
 
171
174
  # puts "Path #{ENV['PATH']}"
172
175
  throw "No data found in config file" unless @env!=nil
173
- @info_level=default_config_yml['message_level']
174
-
176
+
177
+ @info_level="fatal"
178
+ @info_level=default_config_yml['message_level'] unless default_config_yml['message_level'].nil?
179
+
175
180
  end
176
181
 
177
182
 
@@ -196,15 +201,17 @@ class Constants
196
201
  throw "Unable to create file logger at path #{self.log_file}" unless @file_logger!=nil
197
202
  throw "Unable to create stdout logger " unless @stdout_logger!=nil
198
203
 
199
-
200
-
201
204
  case @info_level
202
- when "info"
205
+ when /info/i
203
206
  @stdout_logger.level=Logger::INFO
204
- when "debug"
207
+ when /debug/i
205
208
  @stdout_logger.level=Logger::DEBUG
206
- when "warn"
207
- @stdout_logger.level=Logger::WARN
209
+ when /warn/i
210
+ @stdout_logger.level=Logger::WARN
211
+ when /fatal/i
212
+ @stdout_logger.level=Logger::FATAL
213
+ else
214
+ throw "Unknown log level #{@info_level}"
208
215
  end
209
216
 
210
217
  end
@@ -215,8 +222,9 @@ class Constants
215
222
  if ( @stdout_logger == nil || @file_logger == nil)
216
223
  initialize_loggers
217
224
  end
218
- @stdout_logger.send(level,message)
219
- @file_logger.send(level,message)
225
+
226
+ @stdout_logger.send(level,message)
227
+ @file_logger.send(level,message)
220
228
  end
221
229
 
222
230
  def path_for_builtin_database(dbname)
@@ -0,0 +1,60 @@
1
+ require 'protk/constants'
2
+ require 'bio'
3
+
4
+
5
# In-memory index over the records of a GFF3 file.
#
# Two lookups are built by make_index:
# * record id        -> all records carrying that id
# * 'Parent' attribute -> all CDS records naming that parent
#
# Use GFFDB.create(path) to get a ready-to-query instance; GFFDB.new alone
# leaves both indexes empty.
class GFFDB

  attr_accessor :id_to_records_map

  # gff_file_path:: path to the GFF3 file this database is associated with.
  #                 The file is not read until make_index is called.
  def initialize(gff_file_path)
    # NOTE(review): the Constants instance is not used here; the call is kept
    # so that construction behaves exactly as before - confirm whether it is needed.
    Constants.new
    @database = gff_file_path
    @id_to_records_map = {}
    @id_to_cds_map = {}
  end

  # Build a GFFDB for gff_file_path and index the file straight away.
  def self.create(gff_file_path)
    db = GFFDB.new(gff_file_path)
    db.make_index(gff_file_path)
    db
  end

  # All records whose id equals entry_id, or nil when the id was never seen.
  def get_by_id(entry_id)
    @id_to_records_map[entry_id]
  end

  # All CDS records whose 'Parent' attribute equals entry_id, or nil.
  def get_cds_by_parent_id(entry_id)
    @id_to_cds_map[entry_id]
  end

  # Parse input_gff and add every record to the indexes.
  #
  # The file is opened with the block form of File.open so the handle is
  # always closed again, even when parsing raises. The parser is handed the
  # open IO inside the block (the original code notes that construction
  # parses the entire db), so the records are available after the close.
  #
  # A failure while classifying a single record for the CDS index is reported
  # on stdout and does not abort indexing of the remaining records.
  def make_index(input_gff)
    gffdb = File.open(input_gff, "r") { |io| Bio::GFF::GFF3.new(io) }

    # Now create the mapping from ids to records
    gffdb.records.each do |record|
      (@id_to_records_map[record.id] ||= []) << record

      begin
        if record.feature_type.to_s =~ /CDS/i
          parent_id = record.attributes_to_hash['Parent']
          (@id_to_cds_map[parent_id] ||= []) << record if parent_id
        end
      rescue
        puts "Problem initializing cds map for #{record}"
      end
    end
  end

end
@@ -0,0 +1,158 @@
1
+ require 'libxml'
2
+ require 'bio'
3
+ require 'protk/bio_gff3_extensions'
4
+ include LibXML
5
+
6
# Raised when a peptide sequence cannot be located within a protein sequence.
class PeptideNotInProteinError < StandardError; end
8
+
9
# A peptide sequence, optionally with its charge and NSP adjusted
# probability, plus helpers to locate it in a protein and to project it onto
# genomic coordinates as GFF3 records.
#
# Instances are created through Peptide.from_protxml or
# Peptide.from_sequence; Peptide.new is private.
class Peptide

  attr_accessor :sequence
  attr_accessor :protein_name
  attr_accessor :charge
  attr_accessor :nsp_adjusted_probability

  class << self
    # Build a peptide from a node that answers [] for the attribute names
    # 'peptide_sequence', 'nsp_adjusted_probability' and 'charge'.
    # The probability is converted with to_f and the charge with to_i.
    def from_protxml(xmlnode)
      pep=new()
      pep.sequence=xmlnode['peptide_sequence']
      pep.nsp_adjusted_probability=xmlnode['nsp_adjusted_probability'].to_f
      pep.charge=xmlnode['charge'].to_i
      pep
    end

    # Build a peptide from a bare sequence and an optional charge.
    # nsp_adjusted_probability is left unset (nil).
    def from_sequence(seq,charge=nil)
      pep=new()
      pep.sequence=seq
      pep.charge=charge
      pep
    end
    private :new
  end

  def initialize()

  end

  # Locate this peptide within prot_seq.
  #
  # Expects prot_seq not to contain explicit stop codon (ie * at end)
  # AA coords are 0-based unlike genomic coords which are 1 based
  #
  # prot_seq:: protein sequence string to search
  # reverse::  when true, both sequences are reversed before searching, so
  #            the coordinates are relative to the reversed protein
  #
  # Returns {:start => index of first residue, :end => start + peptide length}
  # (ie the end is exclusive).
  # Raises PeptideNotInProteinError when the peptide does not occur.
  def coords_in_protein(prot_seq,reverse=false)
    if reverse
      pep_index = prot_seq.reverse.index(self.sequence.reverse)
      raise PeptideNotInProteinError if pep_index.nil?
      pep_start_i = pep_index
    else
      pep_start_i = prot_seq.index(self.sequence)
      raise PeptideNotInProteinError if pep_start_i.nil?
    end
    pep_end_i = pep_start_i+self.sequence.length
    {:start => pep_start_i,:end => pep_end_i}
  end


  # Returns a list of GFF3 records, one per CDS that the peptide overlaps, in GFF style (1 based) genomic coordinates
  #
  # Assumes that cds_coords is inclusive of the entire protein sequence including start-met
  #
  # We assume that gff records conform to the spec
  #
  # http://www.sequenceontology.org/gff3.shtml
  #
  # This part of the spec is crucial
  #
  # - The START and STOP codons are included in the CDS.
  # - That is, if the locations of the start and stop codons are known,
  # - the first three base pairs of the CDS should correspond to the start codon
  # - and the last three correspond the stop codon.
  #
  # We also assume that all the cds records provided, actually form part of the protein (ie skipped exons should not be included)
  #
  # prot_seq::      protein sequence containing this peptide
  # parent_record:: Bio::GFF::GFF3::Record whose strand decides the direction
  # cds_records::   Array of CDS records; they are sorted here, so any order is accepted
  #
  # CDS records that do not overlap the peptide contribute nothing. (Earlier
  # code emitted a record with start > end for every CDS lying wholly before
  # the peptide; those are no longer produced.)
  #
  # Raises PeptideNotInProteinError (via coords_in_protein) when the peptide
  # is not found in prot_seq.
  def to_gff3_records(prot_seq,parent_record,cds_records)

    # NOTE: throw with a tag nobody catches surfaces as an UncaughtThrowError;
    # kept as throw so callers observe the same exception as before.
    throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
    throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array

    on_reverse_strand = parent_record.strand=="-"
    aa_coords = coords_in_protein(prot_seq,false) # Always use forward protein coordinates

    # Walk the CDS records in translation order: ascending start on the
    # forward strand, descending on the reverse strand.
    ordered_cds_records = on_reverse_strand ? cds_records.sort.reverse : cds_records.sort

    # Peptide extent measured in nucleic acids from the start of translation
    pep_start_i = aa_coords[:start]*3
    pep_end_i = pep_start_i+self.sequence.length*3

    i = 0 #Current protein position (in nucleic acids)
    fragments=[]
    ordered_cds_records.each do |cds_record|
      cds_len = cds_record.length

      # Bases of this CDS that precede the peptide start
      before_len = [pep_start_i-i,0].max
      # Bases of this CDS that belong to the peptide; zero or negative when
      # the CDS lies entirely before or entirely after the peptide
      fragment_len = [cds_len-before_len,pep_end_i-i-before_len].min

      if fragment_len>0
        if on_reverse_strand
          # Translation runs from the high coordinate downwards
          fragment_end = cds_record.end-before_len
          fragment_start = fragment_end-fragment_len+1
        else
          fragment_start = cds_record.start+before_len
          fragment_end = fragment_start+fragment_len-1
        end
        fragments << gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
      end

      i+=cds_len
    end
    fragments
  end

  # Build the GFF3 record describing one fragment of this peptide.
  #
  # start_i, end_i:: genomic coordinates of the fragment
  # parent_record::  the CDS record the fragment lies in; supplies seqid,
  #                  strand and the Parent id
  #
  # The record has source "MSMS", type "polypeptide", phase 0 and an ID of
  # "<cds id>.<sequence>" with ".<charge>" appended when a charge is set.
  # The score column is the NSP adjusted probability, or "." when unset.
  def gff_record_for_peptide_fragment(start_i,end_i,parent_record)
    cds_id = parent_record.id
    this_id = "#{cds_id}.#{self.sequence}"
    this_id = "#{this_id}.#{self.charge}" unless self.charge.nil?
    score = self.nsp_adjusted_probability.nil? ? "." : self.nsp_adjusted_probability.to_s
    gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
    Bio::GFF::GFF3::Record.new(gff_string)
  end


end
@@ -0,0 +1,72 @@
1
+ require 'protk/peptide'
2
+
3
+ include LibXML
4
+
5
+
6
# A protein entry read from a protXML document, together with the peptides
# listed under it.
#
# Instances are created through Protein.from_protxml; Protein.new is private.
class Protein

  attr_accessor :group_number
  attr_accessor :group_probability
  attr_accessor :probability
  attr_accessor :sequence
  attr_accessor :protein_name
  attr_accessor :n_indistinguishable_proteins
  attr_accessor :percent_coverage
  attr_accessor :peptides

  class << self

    # Example of the XML this factory reads:
    #
    # <protein_group group_number="1" probability="1.0000">
    # <protein protein_name="ACADV_MOUSE" n_indistinguishable_proteins="1" probability="1.0000" percent_coverage="9.9" unique_stripped_peptides="ELGAFGLQVPSELGGLGLSNTQYAR+GIVNEQFLLQR+SGELAVQALDQFATVVEAK+VAVNILNNGR" group_sibling_id="a" total_number_peptides="4" pct_spectrum_ids="0.41" confidence="1.00">
    # <parameter name="prot_length" value="656"/>
    # <annotation protein_description="Very long-chain specific acyl-CoA dehydrogenase, mitochondrial OS=Mus musculus GN=Acadvl PE=1 SV=3"/>
    # <peptide peptide_sequence="SGELAVQALDQFATVVEAK" charge="1" initial_probability="0.9919" nsp_adjusted_probability="0.9981" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.34" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.99" is_contributing_evidence="Y" calc_neutral_pep_mass="1975.0340">
    # </peptide>
    # <peptide peptide_sequence="GIVNEQFLLQR" charge="1" initial_probability="0.9909" nsp_adjusted_probability="0.9979" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.34" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.99" is_contributing_evidence="Y" calc_neutral_pep_mass="1315.7250">
    # </peptide>
    # <peptide peptide_sequence="ELGAFGLQVPSELGGLGLSNTQYAR" charge="1" initial_probability="0.7792" nsp_adjusted_probability="0.9391" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.55" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.78" is_contributing_evidence="Y" calc_neutral_pep_mass="2576.3234">
    # </peptide>
    # <peptide peptide_sequence="VAVNILNNGR" charge="1" initial_probability="0.5674" nsp_adjusted_probability="0.8515" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.76" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.57" is_contributing_evidence="Y" calc_neutral_pep_mass="1068.6030">
    # </peptide>
    # </protein>
    # </protein_group>
    #
    # xmlnode is the <protein> node. Group level values are read from its
    # parent node; numeric attributes are converted with to_f / to_i. Each
    # child found by the 'protxml:peptide' query becomes a Peptide via
    # Peptide.from_protxml.
    def from_protxml(xmlnode)
      new.tap do |protein|
        group = xmlnode.parent
        protein.group_probability = group['probability'].to_f
        protein.group_number = group['group_number'].to_i

        protein.probability = xmlnode['probability'].to_f
        protein.protein_name = xmlnode['protein_name']
        protein.n_indistinguishable_proteins = xmlnode['n_indistinguishable_proteins'].to_i
        protein.percent_coverage = xmlnode['percent_coverage'].to_f

        children = xmlnode.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
        protein.peptides = children.collect { |child| Peptide.from_protxml(child) }
      end
    end
    private :new
  end

  def initialize()

  end

  # Return just one peptide for each unique sequence choosing the peptide with highest probability
  #
  # When several peptides share a sequence and the top probability, the one
  # that appears first in +peptides+ is kept.
  def representative_peptides()
    winners = self.peptides.each_with_object({}) do |candidate, best_by_sequence|
      incumbent = best_by_sequence[candidate.sequence]
      if incumbent.nil? || candidate.nsp_adjusted_probability > incumbent.nsp_adjusted_probability
        best_by_sequence[candidate.sequence] = candidate
      end
    end

    winners.values
  end

end