divvy_proteomics 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
# Fetch gems over HTTPS rather than plain HTTP so downloads are encrypted.
source "https://rubygems.org"

# Runtime dependency: logging used by the divvy_spectra executable.
gem 'bio-logger', ">=0"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem 'systemu', ">=0"
  gem "rspec", ">= 2.8.0"
  gem "rdoc", ">= 3.12"
  gem "bundler", ">= 1.0.0"
  gem "jeweler", ">= 1.8.4"
end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2013 Ben J Woodcroft
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ # divvy_proteomics
2
+
3
+ Takes a tab-separated DTASelect output file, parses the results, and divvies up the spectra that are shared between multiple proteins.
4
+
5
+ ## Install
6
+ Get ruby somehow, if you don't already have it.
7
+ ```
8
+ gem install divvy_proteomics
9
+ ```
10
+
11
+ ## Usage
12
+ ```
13
+ $ divvy_spectra -h
14
+
15
+ Usage: divvy_spectra [options] <DTASelect_file>
16
+
17
+ Takes a tab separated file containing a (possibly modified) output from a DTAselect run, and use some algorithm to divy up the spectra that match multiple peptides.
18
+
19
+ --merge-proteins FILE_OF_IDENTIFIERS
20
+ Provide a space/tab separated file where the identifiers on each row should be treated as one protein
21
+ --whitelist FILE_OF_PROTEINS_TO_REPORT
22
+ Only report proteins that are in this whitelist, after divvying with everything
23
+
24
+ Verbosity:
25
+
26
+ -q, --quiet Run quietly, set logging to ERROR level [default INFO]
27
+ --logger filename Log to file [default stderr]
28
+ --trace options Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG
29
+ ```
30
+
31
+ ## Contributing to divvy\_proteomics
32
+
33
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
34
+ * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
35
+ * Fork the project.
36
+ * Start a feature/bugfix branch.
37
+ * Commit and push until you are happy with your contribution.
38
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
39
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
40
+
41
+ ## Copyright
42
+
43
+ Copyright (c) 2013 Ben J Woodcroft. See LICENSE.txt for
44
+ further details.
45
+
data/Rakefile ADDED
@@ -0,0 +1,49 @@
1
# encoding: utf-8
#
# Rake tasks for the divvy_proteomics gem: jeweler packaging tasks, RSpec
# runners (:spec and :rcov) and RDoc generation. The default task is :spec.

require 'rubygems'
require 'bundler'

# Activate the default and development gem groups; if Bundler cannot, print
# its message and exit with the status code carried by the error.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gemspec|
  # gemspec is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gemspec.name = "divvy_proteomics"
  gemspec.homepage = "http://github.com/wwood/divvy_proteomics"
  gemspec.license = "MIT"
  gemspec.summary = "divvy up spectra from DTASelect files in a parsimonious way"
  gemspec.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
  gemspec.email = "donttrustben@gmail.com"
  gemspec.authors = ["Ben J Woodcroft"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'

# Run every *_spec.rb file under spec/
RSpec::Core::RakeTask.new(:spec) do |task|
  task.pattern = FileList['spec/**/*_spec.rb']
end

# Same specs, with the task's rcov flag switched on
RSpec::Core::RakeTask.new(:rcov) do |task|
  task.pattern = 'spec/**/*_spec.rb'
  task.rcov = true
end

task default: :spec

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Use the contents of the VERSION file in the title when the file exists
  version = if File.exist?('VERSION')
              File.read('VERSION')
            else
              ""
            end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "divvy_proteomics #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
data/bin/divvy_spectra ADDED
@@ -0,0 +1,362 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'bio-logger'
5
+ require 'pp'
6
+ require 'set'
7
+
8
# Name of this executable, and the logger name derived from it
SCRIPT_NAME = File.basename(__FILE__)
LOG_NAME = SCRIPT_NAME.gsub('.rb','')

# Default settings; the command line flags parsed below overwrite them
options = {
  :logger => 'stderr',
  :log_level => 'info',
  :contaminant_prefix => /^CNTM:/,
}

option_parser = OptionParser.new do |opts|
  opts.banner = "\nUsage: #{SCRIPT_NAME} [options] <DTASelect_file>\n\nTakes a tab separated file containing a (possibly modified) output from a DTAselect run, and use some algorithm to divy up the spectra that match multiple peptides.\n\n"

  opts.on("--merge-proteins FILE_OF_IDENTIFIERS", "Provide a space/tab separated file where the identifiers on each row should be treated as one protein") do |path|
    options[:merge_proteins_file] = path
  end
  opts.on("--whitelist FILE_OF_PROTEINS_TO_REPORT", "Only report proteins that are in this whitelist, after divvying with everything") do |path|
    options[:whitelist_file] = path
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do
    options[:log_level] = 'error'
  end
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") do |filename|
    options[:logger] = filename
  end
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do |level|
    options[:log_level] = level
  end
end
option_parser.parse!

# After flag parsing at most one positional argument (the DTASelect file) may
# remain; otherwise print the usage text and exit with status 1.
if ARGV.length > 1
  $stderr.puts option_parser
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)
41
+
42
# A protein reported in a DTASelect file, together with the peptides
# (spectra hits) that were listed for it.
class SelectedProtein
  attr_accessor :identifier

  attr_accessor :sequence_count, :spectrum_count, :sequence_coverage, :length, :molwt, :pi, :validation_status, :descriptive_name

  # Array of peptide objects. Each is expected to respond to #identifier,
  # #redundancy (its spectrum count) and #parent_proteins (the proteins it
  # is listed under).
  attr_accessor :peptides

  def initialize
    @peptides = []
  end

  # Sum of the redundancy of every peptide whose only parent protein is
  # this one. Returns 0 when there are no such peptides.
  def unique_spectra
    summed_redundancy { |pep| pep.parent_proteins.length == 1 }
  end

  # Sum of the redundancy of every peptide that is not counted by
  # #unique_spectra. Returns 0 when there are no such peptides.
  def non_unique_spectra
    summed_redundancy { |pep| pep.parent_proteins.length != 1 }
  end

  # Are there any peptides that are assigned exclusively to this protein?
  def uniquely_identified_by_any_peptides?
    unique_spectra > 0
  end

  # Estimate how many spectra belong to this protein.
  #
  # With no peptides the result is 0. If none of the peptides are unique to
  # this protein, the summed redundancy of all its peptides is divided
  # equally among the parent proteins of the first peptide. Otherwise each
  # peptide contributes its redundancy scaled by this protein's share of the
  # unique spectra of all of that peptide's parent proteins.
  def estimated_spectral_count
    return 0 if @peptides.nil? || @peptides.empty?

    # If all peptides are non-unique and shared with some number of other
    # proteins, share the spectra out equally between those proteins.
    unless uniquely_identified_by_any_peptides?
      shared_parents = peptides[0].parent_proteins
      if peptides.any? { |pep| pep.parent_proteins != shared_parents }
        log.warn "Found a protein (#{identifier}) that shares all its peptides with a non-constant set of proteins, hoping this is a rare event, estimated spectral count likely wrong"
      end
      num_shared_proteins = shared_parents.length
      num_peptide_spectra = peptides.inject(0) { |sum, pep| sum + pep.redundancy }
      # The first number is a count of proteins, so say so in the message
      log.debug "Found #{num_shared_proteins} proteins sharing #{num_peptide_spectra} peptide spectra"
      return num_peptide_spectra.to_f / num_shared_proteins
    end

    # This protein's own unique spectra do not change between peptides, so
    # work them out once rather than once per peptide.
    own_unique_spectra = unique_spectra.to_f
    peptide_shares = peptides.map do |peptide|
      linked_unique_spectra = peptide.parent_proteins.map { |pro| pro.unique_spectra }
      log.debug "Tallying peptide #{peptide.identifier}, which has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
      log.debug "These proteins have #{linked_unique_spectra.inspect} unique spectra each"
      own_unique_spectra / linked_unique_spectra.reduce(:+) * peptide.redundancy
    end
    peptide_shares.reduce(:+)
  end

  # The logger registered under LOG_NAME.
  def log
    Bio::Log::LoggerPlus[LOG_NAME]
  end

  private

  # Sum the redundancy of the peptides for which the given block is true,
  # or 0 when there are no peptides at all.
  def summed_redundancy
    return 0 if @peptides.nil? || @peptides.empty?
    @peptides.select { |pep| yield pep }.inject(0) { |sum, pep| sum + pep.redundancy }
  end
end
101
+
102
# A single peptide/spectrum hit from a DTASelect file, linked to every
# protein it was listed under.
class Peptide
  # Attributes reported by #inspect, in output order. Kept in one place so
  # the description cannot drift out of step with itself.
  INSPECTED_ATTRIBUTES = [:identifier, :xcorr, :deltcn, :obs_mono_mz, :cal_mono_mz, :total_intensity, :sp_rank, :sp_score, :ion_proportion, :redundancy, :sequence].freeze

  attr_accessor :identifier

  attr_accessor :reported_unique

  attr_accessor :xcorr, :deltcn, :obs_mono_mz, :cal_mono_mz, :total_intensity, :sp_rank, :sp_score, :ion_proportion, :redundancy, :sequence

  attr_accessor :unique

  # Array of the protein objects this peptide is assigned to; each is
  # expected to respond to #identifier.
  attr_accessor :parent_proteins

  def initialize
    @parent_proteins = []
  end

  # One-line description: the number and identifiers of the parent proteins
  # followed by ", name: value" for each of INSPECTED_ATTRIBUTES. Only the
  # parents' identifiers are printed, not the parent objects themselves.
  def inspect
    parent_ids = @parent_proteins.map { |pro| pro.identifier }.join(', ')
    # Build the attribute list in one pass instead of growing a string with +=
    attributes = INSPECTED_ATTRIBUTES.map { |var| ", #{var}: #{send(var)}" }.join
    "Peptide: #{@parent_proteins.length} @parent_proteins: [#{parent_ids}]#{attributes}"
  end
end
124
+
125
# Read in merges, if required.
#
# Each row of the merge file names a primary identifier followed by the
# secondary identifier that should be folded into it. The resulting hash maps
# secondary identifier => primary identifier.
mergers = {}
if options[:merge_proteins_file]
  # File.foreach closes the file once every line has been read
  File.foreach(options[:merge_proteins_file]) do |line|
    splits = line.strip.split(/\s+/)
    raise "This script can only handle two-way merging at the moment, sorry" if splits.length > 2

    primary_id, secondary_id = splits
    # Blank rows and rows holding a single identifier define no merge
    next if secondary_id.nil?

    raise "ID supposedly matches to multple identifiers: #{secondary_id}" if mergers[secondary_id] and mergers[secondary_id] != primary_id
    mergers[secondary_id] = primary_id
  end

  if mergers.empty?
    # Previously the example in the message below raised NoMethodError here
    log.warn "Merging of identifiers requested, but no merges were found in #{options[:merge_proteins_file]}"
  else
    example_secondary, example_primary = mergers.first
    log.info "Merging of identifiers setup for #{mergers.length} different instances, e.g. #{example_secondary} => #{example_primary}"
  end
end
141
+
142
# Read in whitelist: a whitespace separated list of the protein identifiers
# that are allowed to be reported.
whitelist = Set.new
if options[:whitelist_file]
  # File.read closes the file after reading. Splitting on whitespace with no
  # argument also ignores leading whitespace, so no empty identifier is
  # created. The IDs stay in a Set because they are only used for membership
  # tests when reporting.
  whitelist = Set.new(File.read(options[:whitelist_file]).split)
  raise "whitelist empty" unless whitelist.length > 0
  log.info "Read in #{whitelist.length} IDs into the whitelist, only those will be reported. e.g. #{whitelist.first}"
end
149
+
150
+
151
# Hashes of identifiers to objects
proteins = {} # protein identifier => SelectedProtein
hits = {}     # spectrum identifier => Peptide

# Read in the tab separated file
reading_header = true              # true until the row starting with 'Unique' has been read
current_proteins = []              # proteins that the peptide rows being read belong to
last_line_was_protein_name = false # consecutive protein rows share the peptide rows that follow

# Parse each line of the DTAselect file
ARGF.each_line do |line|
  splits = line.chomp.split("\t")
  log.debug "Parsing line `#{line.chomp}'"

  if reading_header
    log.debug "reading header"
    reading_header = false if splits[0] == 'Unique'
    next
  end

  # A blank line splits into no columns at all. Without this guard it was
  # treated as a protein row and registered a protein with a nil identifier.
  next if splits.empty?

  # OK, now we are reading the actual table, not the header
  if splits[0] != '' && splits[11].nil?
    # Protein row: something in the first column and fewer than 12 columns
    ident = splits[0]

    unless last_line_was_protein_name
      # Sometimes several proteins are given one after another before their
      # peptides; only the first of such a run starts a new group
      log.debug "New protein now being parsed"
      current_proteins = []
    end

    current_protein = SelectedProtein.new
    last_line_was_protein_name = true
    current_proteins.push current_protein

    current_protein.identifier = ident
    current_protein.sequence_count = splits[1].to_i
    current_protein.spectrum_count = splits[2].to_i
    current_protein.sequence_coverage = splits[3].to_f
    current_protein.length = splits[4].to_i
    current_protein.molwt = splits[5].to_f
    current_protein.pi = splits[6].to_f
    current_protein.validation_status = splits[7].to_f
    current_protein.descriptive_name = splits[8]

    if proteins[ident]
      raise "Unexpectedly found the same protein identifier twice: #{ident}, from line #{line.chomp}"
    end
    proteins[ident] = current_protein

  elsif splits[1] == 'Proteins'
    # Done processing, except for the bits down the bottom which aren't parsed (yet)
    break

  else
    log.debug "New spectra now being parsed"
    last_line_was_protein_name = false

    # Record a spectra
    ident = splits[1]
    # A row without an identifier column used to fail with NoMethodError on
    # nil; it now raises the same descriptive error as a too-short identifier
    raise "Unexpected hits name `#{ident}', from line `#{line.chomp}'" unless ident && ident.length > 10

    # The same spectrum may be listed under several proteins; only its first
    # appearance creates the Peptide, later ones just add parent proteins
    pep = hits[ident]
    if pep.nil?
      pep = Peptide.new
      pep.identifier = ident
      pep.reported_unique = splits[0]

      pep.xcorr = splits[2].to_f
      pep.deltcn = splits[3].to_f
      pep.obs_mono_mz = splits[4].to_f
      pep.cal_mono_mz = splits[5].to_f
      pep.total_intensity = splits[6].to_f
      pep.sp_rank = splits[7].to_f
      pep.sp_score = splits[8].to_f
      pep.ion_proportion = splits[9].to_f
      pep.redundancy = splits[10].to_i
      pep.sequence = splits[11]

      hits[ident] = pep
    end

    # Link the peptide and every protein of the current group both ways
    current_proteins.each do |parent|
      pep.parent_proteins.push parent
      parent.peptides.push pep
    end
    log.debug "Parsed this peptide #{pep.inspect}"
  end
end

log.debug "Proteins parsed: #{proteins.inspect}"
251
+
252
+
253
# Merge proteins that are known duplicates if need be.
# mergers maps each secondary identifier to the primary it is folded into.
mergers.each do |secondary_id, primary_id|
  log.debug "Merging proteins #{primary_id} and #{secondary_id}"
  primary = proteins[primary_id]
  secondary = proteins[secondary_id]

  if primary && secondary
    # Do the merge
    log.debug "Both are defined, so doing the complicated merge"

    # Invalidate some things about the primary ID because they are no longer valid
    primary.sequence_count = nil
    primary.sequence_coverage = nil
    primary.length = nil
    primary.molwt = nil
    primary.pi = nil
    primary.validation_status = nil
    # Keep the primary proteins' description, I reckon

    # Renders, per peptide of the primary, the identifiers of its parent proteins
    parent_ids_of_primary_peptides = lambda do
      primary.peptides.collect{|pep| pep.parent_proteins.collect{|pro| pro.identifier}}.inspect
    end

    # When there is spectra that are in the secondary but not the primary, add them to the primary's repertoire.
    primary_peptide_names = primary.peptides.map { |pep| pep.identifier }
    log.debug "Before transfer of the second protein's peptides, the primary proteins has #{primary.peptides.length} different peptides"
    log.debug "Parent protein IDs of primary peptides: #{parent_ids_of_primary_peptides.call}"
    secondary.peptides.each do |sec_pep|
      next if primary_peptide_names.include?(sec_pep.identifier)
      primary.peptides.push sec_pep
      sec_pep.parent_proteins.push primary
    end
    log.debug "After transfer of the second protein's peptides, the primary proteins has #{primary.peptides.length} different peptides"
    log.debug "Parent protein IDs of primary peptides: #{parent_ids_of_primary_peptides.call}"

    # Remove references second protein from the second peptides
    secondary.peptides.each do |pep|
      pep.parent_proteins.reject! { |pro| pro == secondary }
    end
    log.debug "Parent protein IDs of primary peptides: #{parent_ids_of_primary_peptides.call}"

    # Remove the secondary protein from the list of proteins
    proteins.delete secondary_id

  elsif secondary
    raise "You've reached a place in the code that is implemented but untested"
    # Rename the secondary as the primary (not reached while the raise above is in place)
    proteins[primary_id] = secondary
    proteins.delete secondary_id
    secondary.identifier = primary_id
    # The peptide objects should have the correct parent proteins because it is all references

  end # The other two cases do not require any intervention
end
304
+
305
+
306
# Total spectra shouldn't count contaminants, but shared spectra should still be divvied up with.
# Contaminants are the proteins whose identifier matches options[:contaminant_prefix].
total_contaminating_spectra = proteins.
  select { |ident, _protein| ident.match(options[:contaminant_prefix]) }.
  collect { |_ident, protein| protein.estimated_spectral_count }.
  inject(0) { |sum, count| sum + count }

# Starting the sum from 0 keeps this a number when no peptides were parsed
# (reduce(:+) on an empty list is nil, which then failed on the subtraction)
total_spectra = hits.values.inject(0) { |sum, pep| sum + pep.redundancy } - total_contaminating_spectra
log.info "Parsed in #{proteins.length} proteins and #{hits.length} peptides, and #{total_spectra.to_i} non-contaminating spectra"

log.debug "Proteins parsed: #{proteins.inspect}"

# Tally the peptides assigned to several proteins against those assigned to exactly one
all_peptides = hits.values.uniq
number_shared_peptides = all_peptides.count { |pep| pep.parent_proteins.length > 1 }
number_non_shared_peptides = all_peptides.count { |pep| pep.parent_proteins.length == 1 }
total_peptides = number_shared_peptides+number_non_shared_peptides
log.info "Found #{number_shared_peptides} (#{number_shared_peptides.to_f/total_peptides*100}%) shared peptides and #{number_non_shared_peptides} (#{number_non_shared_peptides.to_f/total_peptides*100}%) non-shared peptides"

# Find non-starred peptides that occur only once in the file - maybe not possible given a correctly formatted file?
# reported_unique holds the first column of the peptide's row, which is an
# empty string (never nil) when nothing was marked, so test for blank.
non_starred_but_uniquely_identified_peptides = hits.values.select do |peptide|
  peptide.reported_unique.to_s.strip.empty? && peptide.parent_proteins.length == 1
end
log.info "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one."
326
+
327
# OK, finished parsing the file. Now output the score for each protein,
# as one tab separated row per protein underneath a header row.
puts [
  'ID',
  'Unique spectra',
  'Non-unique spectra',
  'Estimated total spectra',
  'Normalised spectral count',
  'Description',
  'Proteins sharing spectra',
].join "\t"
proteins.each do |protein_id, protein|
  next if protein_id.match(options[:contaminant_prefix]) #Don't print contaminants
  # If there's a whitelist, apply it now
  next unless options[:whitelist_file].nil? || whitelist.include?(protein_id)

  log.debug "Now printing protein #{protein_id}, which has #{protein.peptides.length} associated peptides"

  # Identifiers of the other proteins that any of this protein's peptides are
  # assigned to. Worked out once and used for both the warning and the row.
  shareds = protein.peptides.collect{|pep| pep.parent_proteins.collect{|pro| pro.identifier}}.flatten.uniq.reject{|pro_id| pro_id==protein_id}
  unless protein.uniquely_identified_by_any_peptides?
    log.warn "This protein #{protein_id} shares all of its spectra with other proteins (#{shareds.join(', ')}), sharing the peptides equally (this may not be appropriate)"
  end

  # Estimate once: the call logs as it goes, so calling it for each of the two
  # columns repeated every log message
  estimated_spectra = protein.estimated_spectral_count
  puts [
    protein_id,
    protein.unique_spectra,
    protein.non_unique_spectra,
    estimated_spectra,
    estimated_spectra.to_f / total_spectra,
    protein.descriptive_name,
    shareds.join(','),
  ].join "\t"
end
357
+
358
+
359
+
360
+
361
+
362
+