divvy_proteomics 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f930bb8ef783e793512f0d50bd579055ba475476
4
+ data.tar.gz: a445f2ef2ea22f7453b4e5b08fb66775797ea9bc
5
+ SHA512:
6
+ metadata.gz: 9020c67860e40394138a02cfcf53a85665cc731292650f1c71b46990d3800de2fe9c2643597a277fe09ca17eede493ec7268496e529bb1a625efdd5f0347c7a9
7
+ data.tar.gz: 462c1c66db781653937b156023dc404804d8002f661872dab7b74c8a79d8576cf2110262049e0d6964cfa8162c0bb59d503a6a89841bf4d783eac887dfb14190
data/README.md CHANGED
@@ -1,15 +1,22 @@
1
1
  # divvy_proteomics
2
2
 
3
- Takes a DTASelect CSV file, and parses the reulst
3
+ Takes a DTASelect CSV file, and parses the result so non-unique peptides get accounted for.
4
4
 
5
5
  ## Install
6
- Get ruby somehow, if you don't already have it.
6
+ Get ruby somehow, if you don't already have it. Then, install this gem:
7
7
  ```
8
- gem install divvy_spectra
8
+ $ gem install divvy_proteomics
9
9
  ```
10
10
 
11
11
  ## Usage
12
12
  ```
13
+ $ divvy_spectra <DTASelectFile>
14
+ ```
15
+ Output is a table, with a row for each protein with a few columns, including number of unique spectra and the
16
+ estimated number of spectral counts after sorting out the non-uniqueness.
17
+
18
+ Full usage information:
19
+ ```
13
20
  $ divvy_spectra -h
14
21
 
15
22
  Usage: divvy_spectra [options] <DTASelect_file>
@@ -28,7 +35,7 @@ Verbosity:
28
35
  --trace options Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG
29
36
  ```
30
37
 
31
- == Contributing to divvy\_proteomics
38
+ ## Contributing to divvy\_proteomics
32
39
 
33
40
  * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
34
41
  * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
@@ -38,7 +45,7 @@ Verbosity:
38
45
  * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
39
46
  * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
40
47
 
41
- == Copyright
48
+ ## Copyright
42
49
 
43
50
  Copyright (c) 2013 Ben J Woodcroft. See LICENSE.txt for
44
51
  further details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.1.0
data/bin/divvy_spectra CHANGED
@@ -7,6 +7,10 @@ require 'set'
7
7
 
8
8
  SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
9
9
 
10
+ rootpath = File.dirname(File.dirname(__FILE__))
11
+ $: << File.join(rootpath,'lib')
12
+ require 'dta_select_output'
13
+
10
14
  # Parse command line options into the options hash
11
15
  options = {
12
16
  :logger => 'stderr',
@@ -39,88 +43,7 @@ end
39
43
  # Setup logging
40
44
  Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
41
45
 
42
- class SelectedProtein
43
- attr_accessor :identifier
44
-
45
- attr_accessor :sequence_count, :spectrum_count, :sequence_coverage, :length, :molwt, :pi, :validation_status, :descriptive_name
46
-
47
- attr_accessor :peptides
48
-
49
- def initialize
50
- @peptides = []
51
- end
52
-
53
- def unique_spectra
54
- return 0 if @peptides.nil? or @peptides.empty?
55
- num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
56
- num ||= 0
57
- return num
58
- end
59
-
60
- def non_unique_spectra
61
- return 0 if @peptides.nil? or @peptides.empty?
62
- num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
63
- num ||= 0
64
- return num
65
- end
66
-
67
- # Are there any peptides that are assigned exclusively to this protein?
68
- def uniquely_identified_by_any_peptides?
69
- unique_spectra > 0
70
- end
71
-
72
- def estimated_spectral_count
73
- # How many unique spectra are there for each protein that shares a peptide with the current peptide
74
- return 0 if @peptides.nil? or @peptides.empty?
75
- peptide_shares = []
76
- # If all peptides are non-unique and shared with some number of other proteins, then output a negative number num shared spectra divided by the number of proteins
77
- if !uniquely_identified_by_any_peptides?
78
- shared_parents = peptides[0].parent_proteins
79
- if peptides.find{|pep| pep.parent_proteins != shared_parents}
80
- log.warn "Found a protein (#{identifier}) that shares all its peptides with a non-constant set of proteins, hoping this is a rare event, estimated spectral count likely wrong"
81
- end
82
- num_shared_proteins = shared_parents.length
83
- num_peptide_spectra = peptides.collect{|pep| pep.redundancy}.reduce(:+)
84
- log.debug "Found #{num_shared_proteins} shared peptides and #{num_peptide_spectra} peptide spectra"
85
- return num_peptide_spectra.to_f/num_shared_proteins
86
- else
87
- peptides.each do |peptide|
88
- log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
89
- log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
90
- total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
91
- peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
92
- end
93
- return peptide_shares.reduce(:+)
94
- end
95
- end
96
-
97
- def log
98
- Bio::Log::LoggerPlus[LOG_NAME]
99
- end
100
- end
101
-
102
- class Peptide
103
- attr_accessor :identifier
104
-
105
- attr_accessor :reported_unique
106
46
 
107
- attr_accessor :xcorr, :deltcn, :obs_mono_mz, :cal_mono_mz, :total_intensity, :sp_rank, :sp_score, :ion_proportion, :redundancy, :sequence
108
-
109
- attr_accessor :unique
110
-
111
- attr_accessor :parent_proteins
112
- def initialize
113
- @parent_proteins = []
114
- end
115
-
116
- def inspect
117
- str = "Peptide: #{@parent_proteins.length} @parent_proteins: [#{@parent_proteins.collect{|pro| pro.identifier}.join(', ')}]"
118
- [:identifier, :xcorr, :deltcn, :obs_mono_mz, :cal_mono_mz, :total_intensity, :sp_rank, :sp_score, :ion_proportion, :redundancy, :sequence].each do |var|
119
- str += ", #{var}: #{send(var)}"
120
- end
121
- return str
122
- end
123
- end
124
47
 
125
48
  # Read in merges, if required
126
49
  mergers = {}
@@ -147,107 +70,12 @@ if options[:whitelist_file]
147
70
  log.info "Read in #{whitelist.length} IDs into the whitelist, only those will be reported. e.g. #{whitelist[0]}"
148
71
  end
149
72
 
73
+ # Parse the csv file
74
+ parsed = Bio::DTASelect::OutputFile.parse(ARGF)
150
75
 
151
76
  # Hashes of identifiers to objects
152
- proteins = {}
153
- hits = {}
154
-
155
- # Read in the tab separated file
156
- reading_header = true
157
- current_proteins = []
158
- last_line_was_protein_name = false
159
-
160
- # Parse each line of the DTAselect file
161
- ARGF.each_line do |line|
162
- splits = line.chomp.split("\t")
163
- log.debug "Parsing line `#{line.chomp}'"
164
-
165
- if reading_header
166
- log.debug "reading header"
167
- if splits[0] == 'Unique'
168
- reading_header = false
169
- end
170
- next
171
- end
172
-
173
- # OK, now we are reading the actual table, not the header
174
- if splits[0] != '' and splits[11].nil?
175
- ident = splits[0]
176
-
177
- if !last_line_was_protein_name
178
- # Sometimes several proteins are given all in the one header line
179
- # start a new protein
180
- log.debug "New protein now being parsed"
181
- current_proteins = []
182
- end
183
-
184
- current_protein = SelectedProtein.new
185
- last_line_was_protein_name = true
186
- current_proteins.push current_protein
187
-
188
- current_protein.identifier = ident
189
-
190
- i = 1
191
- current_protein.sequence_count = splits[i].to_i; i+=1
192
- current_protein.spectrum_count = splits[i].to_i; i+=1
193
- current_protein.sequence_coverage = splits[i].to_f; i+=1
194
- current_protein.length = splits[i].to_i; i+=1
195
- current_protein.molwt = splits[i].to_f; i+=1
196
- current_protein.pi = splits[i].to_f; i+=1
197
- current_protein.validation_status = splits[i].to_f; i+=1
198
- current_protein.descriptive_name = splits[i]
199
-
200
- if proteins[ident]
201
- raise "Unexpectedly found the same protein identifier twice: #{ident}, from line #{line.chomp}"
202
- end
203
- proteins[ident] = current_protein
204
-
205
-
206
-
207
- elsif splits[1] == 'Proteins'
208
- # Done processing, except for the bits down the bottom which aren't parsed (yet)
209
- break
210
-
211
-
212
-
213
- else
214
- log.debug "New spectra now being parsed"
215
- last_line_was_protein_name = false
216
-
217
- # Record a spectra
218
- ident = splits[1]
219
- raise "Unexpected hits name `#{ident}', from line `#{line.chomp}'" unless ident.length > 10
220
-
221
- pep = hits[ident]
222
- if pep.nil?
223
- pep = Peptide.new
224
- pep.identifier = ident
225
- pep.reported_unique = splits[0]
226
-
227
- i = 2
228
- pep.xcorr = splits[i].to_f; i+= 1
229
- pep.deltcn = splits[i].to_f; i+= 1
230
- pep.obs_mono_mz = splits[i].to_f; i+= 1
231
- pep.cal_mono_mz = splits[i].to_f; i+= 1
232
- pep.total_intensity = splits[i].to_f; i+= 1
233
- pep.sp_rank = splits[i].to_f; i+= 1
234
- pep.sp_score = splits[i].to_f; i+= 1
235
- pep.ion_proportion = splits[i].to_f; i+= 1
236
- pep.redundancy = splits[i].to_i; i+= 1
237
- pep.sequence = splits[i]
238
-
239
- hits[ident] = pep
240
- end
241
-
242
- current_proteins.each do |current_protein|
243
- pep.parent_proteins.push current_protein
244
- current_protein.peptides.push pep
245
- end
246
- log.debug "Parsed this peptide #{pep.inspect}"
247
- end
248
- end
249
-
250
- log.debug "Proteins parsed: #{proteins.inspect}"
77
+ proteins = parsed.protein_name_to_object
78
+ hits = parsed.peptide_name_to_object
251
79
 
252
80
 
253
81
  # Merge proteins that are known duplicates if need be
@@ -304,10 +132,23 @@ end
304
132
 
305
133
 
306
134
  # Total spectra shouldn't count contaminants, but shared spectra should still be divvied up with
307
- total_contaminating_spectra = proteins.select{|ident, protein| ident.match(options[:contaminant_prefix])}.collect{|i, pro| pro.estimated_spectral_count}.reduce(:+)
308
- total_contaminating_spectra ||= 0
135
# Annoying thing here is when contaminating proteins share spectra: a peptide
# is counted as contaminating as soon as at least one of its parent proteins
# has an identifier matching the contaminant prefix, and then all of its
# spectra (its redundancy) are counted.
total_contaminating_peptides = hits.collect do |_ident, peptide|
  contaminated = peptide.parent_proteins.any? do |prot|
    prot.identifier.match(options[:contaminant_prefix])
  end
  contaminated ? peptide.redundancy : 0
end

# Both sums are seeded with 0. Without a seed, summing an empty list gives nil,
# and the subtraction below would raise instead of producing a total of 0.
total_contaminating_spectra = total_contaminating_peptides.inject(0, :+)
log.info "Found #{total_contaminating_spectra} contaminating spectral counts"

total_spectra = hits.collect{|_ident, pep| pep.redundancy}.inject(0, :+) - total_contaminating_spectra
311
152
  log.info "Parsed in #{proteins.length} proteins and #{hits.length} peptides, and #{total_spectra.to_i} non-contaminating spectra"
312
153
 
313
154
  log.debug "Proteins parsed: #{proteins.inspect}"
@@ -320,9 +161,9 @@ log.info "Found #{number_shared_peptides} (#{number_shared_peptides.to_f/total_p
320
161
 
321
162
  # Find non-starred peptides that occur only once in the file - maybe not possible given a correctly formatted file?
322
163
  non_starred_but_uniquely_identified_peptides = hits.values.select do |peptide|
323
- peptide.reported_unique == nil and peptide.parent_proteins.length == 1
164
+ peptide.dtaselect_attributes['Unique'] == nil and peptide.parent_proteins.length == 1
324
165
  end
325
- log.info "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one."
166
+ log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
326
167
 
327
168
  # OK, finished parsing the file. Now output the score for each protein
328
169
  puts [
@@ -334,15 +175,12 @@ puts [
334
175
  'Description',
335
176
  'Proteins sharing spectra',
336
177
  ].join "\t"
178
+ log.warn "No unique spectra found!" if total_spectra == 0
337
179
  proteins.each do |protein_id, protein|
338
180
  next if protein_id.match(options[:contaminant_prefix]) #Don't print contaminants
339
181
 
340
182
  if options[:whitelist_file].nil? or whitelist.include?(protein_id) # If there's a whitelist, apply it now
341
183
  log.debug "Now printing protein #{protein_id}, which has #{protein.peptides.length} associated peptides"
342
- if !protein.uniquely_identified_by_any_peptides?
343
- shareds = protein.peptides.collect{|pep| pep.parent_proteins.collect{|pro| pro.identifier}}.flatten.uniq.reject{|pro_id| pro_id==protein_id}
344
- log.warn "This protein #{protein_id} shares all of its spectra with other proteins (#{shareds.join(', ')}), sharing the peptides equally (this may not be appropriate)"
345
- end
346
184
  puts [
347
185
  protein_id,
348
186
  protein.unique_spectra,
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "divvy_proteomics"
8
- s.version = "0.0.1"
8
+ s.version = "0.1.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Ben J Woodcroft"]
12
- s.date = "2013-04-13"
12
+ s.date = "2013-11-05"
13
13
  s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
14
14
  s.email = "donttrustben@gmail.com"
15
15
  s.executables = ["divvy_spectra"]
@@ -28,8 +28,10 @@ Gem::Specification.new do |s|
28
28
  "bin/divvy_spectra",
29
29
  "divvy_proteomics.gemspec",
30
30
  "lib/divvy_proteomics.rb",
31
+ "lib/dta_select_output.rb",
31
32
  "spec/data/merge_definition.csv",
32
33
  "spec/data/multiply_mapped_spectra.csv",
34
+ "spec/data/new_format.csv",
33
35
  "spec/data/single_protein.csv",
34
36
  "spec/data/single_protein_with_aliases.csv",
35
37
  "spec/data/three_proteins.csv",
@@ -41,11 +43,11 @@ Gem::Specification.new do |s|
41
43
  s.homepage = "http://github.com/wwood/divvy_proteomics"
42
44
  s.licenses = ["MIT"]
43
45
  s.require_paths = ["lib"]
44
- s.rubygems_version = "1.8.24"
46
+ s.rubygems_version = "2.0.3"
45
47
  s.summary = "divvy up spectra from DTASelect files in a parsimonious way"
46
48
 
47
49
  if s.respond_to? :specification_version then
48
- s.specification_version = 3
50
+ s.specification_version = 4
49
51
 
50
52
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
51
53
  s.add_runtime_dependency(%q<bio-logger>, [">= 0"])
@@ -0,0 +1,215 @@
1
+
2
+
3
+
4
# Parser and object model for DTASelect tab-separated output files.
#
# The namespace is opened with nested `module` keywords so that this file can
# be loaded even when nothing else has defined `Bio` yet.
module Bio
  module DTASelect
    # Mixin giving access to the logger registered under 'divvy_spectra'.
    module Logging
      # Returns whatever Bio::Log::LoggerPlus has registered as 'divvy_spectra'.
      # NOTE(review): presumably nil when that logger was never configured - confirm against bio-logger.
      def log
        Bio::Log::LoggerPlus['divvy_spectra']
      end
    end

    class OutputFile
      # Provides OutputFile.log without having to instantiate a protein.
      extend Bio::DTASelect::Logging

      # A protein (locus) row of the table, together with the peptides listed under it.
      class SelectedProtein
        include Bio::DTASelect::Logging

        attr_accessor :identifier

        attr_accessor :sequence_count, :spectrum_count, :sequence_coverage, :length, :molwt, :pi, :validation_status, :descriptive_name

        # Array of Peptide objects listed under this protein
        attr_accessor :peptides

        def initialize
          @peptides = []
        end

        # Total redundancy of the peptides whose only parent protein is this one.
        def unique_spectra
          summed_redundancy { |pep| pep.parent_proteins.length == 1 }
        end

        # Total redundancy of the peptides that do not have exactly one parent protein.
        def non_unique_spectra
          summed_redundancy { |pep| pep.parent_proteins.length != 1 }
        end

        # Are there any peptides that are assigned exclusively to this protein?
        def uniquely_identified_by_any_peptides?
          unique_spectra > 0
        end

        # Estimates this protein's spectral count. Each peptide contributes its
        # redundancy scaled by this protein's unique spectra divided by the
        # summed unique spectra of all of the peptide's parent proteins.
        #
        # Returns 0 when there are no peptides or none of them is exclusive to
        # this protein, otherwise a Float.
        def estimated_spectral_count
          return 0 if @peptides.nil? or @peptides.empty?
          # Don't attempt to divvy up proteins identified only by shared
          # peptides, because there are too many assumptions involved
          return 0 unless uniquely_identified_by_any_peptides?

          own_unique_spectra = unique_spectra
          @peptides.inject(0.0) do |total, peptide|
            linked_unique_spectra = peptide.parent_proteins.collect { |pro| pro.unique_spectra }
            log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
            log.debug "These proteins have #{linked_unique_spectra.inspect} unique spectra each"
            # The denominator includes own_unique_spectra (> 0 here) whenever
            # this protein is among the peptide's parents, as parse arranges.
            total + own_unique_spectra.to_f / linked_unique_spectra.inject(:+) * peptide.redundancy
          end
        end

        private

        # Sums the redundancy of every peptide for which the block is truthy;
        # 0 when there are no peptides.
        def summed_redundancy
          return 0 if @peptides.nil?
          @peptides.inject(0) { |sum, pep| yield(pep) ? sum + pep.redundancy : sum }
        end
      end

      # A peptide (spectrum file) row of the table.
      class Peptide
        include Bio::DTASelect::Logging

        attr_accessor :identifier

        # Hash of column names to values. These are different for different DTAselect output files, it seems.
        attr_accessor :dtaselect_attributes

        # Array of proteins that have this peptide associated
        attr_accessor :parent_proteins

        def initialize
          @parent_proteins = []
        end

        # Compact description listing parent protein identifiers rather than
        # the full protein objects.
        def inspect
          parent_names = @parent_proteins.collect { |pro| pro.identifier }.join(', ')
          "Peptide: #{@parent_proteins.length} @parent_proteins: [#{parent_names}], @identifier: #{identifier}, @attributes: #{dtaselect_attributes.inspect}"
        end

        # The 'Redundancy' column as an Integer.
        def redundancy
          @dtaselect_attributes['Redundancy'].to_i
        end

        # True when the 'Unique' column of this peptide's row holds a marker,
        # i.e. is not blank.
        # NOTE(review): treats any non-blank marker (e.g. '*' or '2') alike - confirm that is intended.
        def reported_unique?
          marker = dtaselect_attributes && dtaselect_attributes['Unique']
          !marker.to_s.strip.empty?
        end
      end

      # Container for everything parsed out of one file.
      class Result
        include Bio::DTASelect::Logging

        # hash of protein identifier to Protein object
        attr_accessor :protein_name_to_object

        # hash of peptide identifier to Peptide object
        attr_accessor :peptide_name_to_object
      end

      # Parses a DTASelect output file.
      #
      # io:: anything responding to each_line (e.g. an IO, StringIO or ARGF)
      #
      # Returns a Result. Raises RuntimeError when the peptide column header's
      # second field is not 'FileName', when a protein identifier occurs twice,
      # or when a peptide row has a missing or suspiciously short name.
      def self.parse(io)
        result = Result.new

        # Hashes of identifiers to objects
        result.protein_name_to_object = {}
        result.peptide_name_to_object = {}

        reading_header = true
        current_proteins = []
        last_line_was_protein_name = false
        peptide_attribute_names = nil

        # Parse each line of the DTAselect file
        io.each_line do |line|
          splits = line.chomp.split("\t")
          log.debug "Parsing line `#{line.chomp}'"

          if reading_header
            log.debug "reading header"
            if splits[0] == 'Unique'
              reading_header = false

              # Current line describes the peptide attributes
              peptide_attribute_names = splits

              # This field has special importance, so be picky
              raise "Badly parsed file at this line: #{line.inspect}, expected 2nd field to be 'FileName', found #{splits[1]}" unless splits[1] == 'FileName'
            end
            next
          end

          # A blank row carries no information; without this guard it would be
          # registered as a protein with a nil identifier.
          next if splits.empty?

          # OK, now we are reading the actual table, not the header
          if splits[0] != '' and splits[11].nil?
            ident = splits[0]

            unless last_line_was_protein_name
              # Sometimes several proteins are given one after another and
              # share the peptides that follow; otherwise start a new group
              log.debug "New protein now being parsed"
              current_proteins = []
            end

            current_protein = build_protein(ident, splits)
            last_line_was_protein_name = true
            current_proteins.push current_protein

            if result.protein_name_to_object[ident]
              raise "Unexpectedly found the same protein identifier twice: #{ident}, from line #{line.chomp}"
            end
            result.protein_name_to_object[ident] = current_protein

          elsif splits[1] == 'Proteins'
            # Done processing, except for the bits down the bottom which aren't parsed (yet)
            break

          else
            log.debug "New spectra now being parsed"
            last_line_was_protein_name = false

            # Record a spectra
            ident = splits[1]
            # to_s so that a row lacking the name field fails with this message
            # rather than a NoMethodError on nil
            raise "Unexpected hits name `#{ident}', from line `#{line.chomp}'" unless ident.to_s.length > 10

            pep = result.peptide_name_to_object[ident]
            if pep.nil?
              pep = Peptide.new
              pep.identifier = ident
              # Pair each header column name with the value at the same position
              pep.dtaselect_attributes = Hash[peptide_attribute_names.zip(splits)]
              result.peptide_name_to_object[ident] = pep
            end

            current_proteins.each do |parent|
              pep.parent_proteins.push parent
              parent.peptides.push pep
            end
            log.debug "Parsed this peptide #{pep.inspect}"
          end
        end

        log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}"
        return result
      end

      # Builds a SelectedProtein from the tab-split fields of a protein row.
      def self.build_protein(ident, splits)
        protein = SelectedProtein.new
        protein.identifier = ident
        protein.sequence_count = splits[1].to_i
        protein.spectrum_count = splits[2].to_i
        protein.sequence_coverage = splits[3].to_f
        protein.length = splits[4].to_i
        protein.molwt = splits[5].to_f
        protein.pi = splits[6].to_f
        protein.validation_status = splits[7].to_f
        protein.descriptive_name = splits[8]
        protein
      end
      private_class_method :build_protein
    end
  end
end
@@ -0,0 +1,49 @@
1
+ DTASelect v1.9
2
+ /auto/gtl/ms/Abisko_Soils/Field_Sampling_Aug_2010/SurfDeep/Erio_Deep_Aug2010_27Jun_TR1/Frac1/analysis/tryp_edeep_082010_500bp_Wheat/sequest
3
+ /auto/gtl/db/Abisko_Soils/edeep_082010_500bp_Wheat_cntm_psm
4
+ SEQUEST v.27 in SQT format.
5
+ --DB -p 2 -r 1000
6
+ true Use criteria
7
+ 1.8 Minimum +1 XCorr
8
+ 2.5 Minimum +2 XCorr
9
+ 3.5 Minimum +3 XCorr
10
+ 0.08 Minimum DeltCN
11
+ 1 Minimum charge state
12
+ 3 Maximum charge state
13
+ 0.0 Minimum ion proportion
14
+ 1000 Maximum Sp rank
15
+ -1.0 Minimum Sp score
16
+ Include Modified peptide inclusion
17
+ Any Tryptic status requirement
18
+ true Multiple, ambiguous IDs allowed
19
+ Ignore Peptide validation handling
20
+ XCorr Purge duplicate peptides by protein
21
+ false Include only loci with unique peptide
22
+ false Remove subset proteins
23
+ Ignore Locus validation handling
24
+ 0 Minimum modified peptides per locus
25
+ 1000 Minimum redundancy for low coverage loci
26
+ 2 Minimum peptides per locus
27
+
28
+ Locus Sequence Count Spectrum Count Sequence Coverage Length MolWt pI Validation Status Descriptive Name
29
+ Unique FileName XCorr DeltCN Obs_mono_m/z Calc_mono_m/z PPM Delta_amu TotalIntensity SpRank SpScore IonProportion Redundancy Sequence
30
+ E1D_raw_1__154436_3 4 58 81.9% 72 7500 7.3 U # 1956 # 2171 # 1 # ID=154436_3;partial=01;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
31
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3 5.8526 0.4034 2772.4479 2772.4445 1.2211 0.0034 6048.2 1 1349.0 40.0 30 -.MLSIQTNIAALSAQNALTTTNNNLQK.S
32
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3 4.5965 0.4101 3275.6674 3275.6608 2.0124 0.0066 5944.2 1 884.7 28.4 7 -.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
33
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2 4.5641 0.462 1594.7954 1594.7956 -0.1435 -0.0002 4981.9 1 1347.9 76.7 4 R.INHAADDAAGLAISEK.M
34
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_04.07913.07913.2 4.8897 0.2913 1384.7232 1384.7250 -1.2769 -0.0018 5676.4 1 1767.3 91.7 17 K.MQAQIGGLNQAVR.N
35
+ E1D_raw_1__154435_1 3 41 79.3% 58 6132 7.3 U # 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
36
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3 5.8526 0.4034 2772.4479 2772.4445 1.2211 0.0034 6048.2 1 1349.0 40.0 30 -.MLSIQTNIAALSAQNALTTTNNNLQK.S
37
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3 4.5965 0.4101 3275.6674 3275.6608 2.0124 0.0066 5944.2 1 884.7 28.4 7 -.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
38
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2 4.5641 0.462 1594.7954 1594.7956 -0.1435 -0.0002 4981.9 1 1347.9 76.7 4 R.INHAADDAAGLAISEK.M
39
+ E1D_raw_1__40591_2 3 8 74.5% 51 5250 8.6 U # 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None
40
+ * 20120806_Erio_Deep_Aug2010_27Jun_TR1_01.18475.18475.2 4.3739 0.5173 2140.0658 2140.0653 0.2192 0.0005 7636.1 1 1642.9 65.0 1 K.TSDVAGDGTTTATILAQSIYR.E
41
+ * 20120806_Erio_Deep_Aug2010_27Jun_TR1_02.21883.21883.2 3.4843 0.1996 2553.2977 2553.2928 1.9293 0.0049 6903.6 1 903.4 47.9 3 K.TSDVAGDGTTTATILAQSIYREGVK.A
42
+ * 20120806_Erio_Deep_Aug2010_27Jun_TR1_08.06194.06194.2 2.7604 0.1784 1326.7055 1326.7083 -2.1145 -0.0028 6041.5 1 860.0 66.7 4 K.AVAAGANPMELKR.G
43
+ Proteins Peptide IDs Copies
44
+ Unfiltered 318515 400116 506301
45
+ Redundant 1575 3555 18759
46
+ Nonredundant 1211 2557 12384
47
+
48
+ Classification Nonredundant Proteins Redundant Proteins
49
+ Unclassified 0 0
@@ -83,8 +83,8 @@ describe script_under_test do
83
83
 
84
84
  stderr.should eq("")
85
85
  answer = header+
86
- ['Mstor_v4.3.2:1344','0','188','94.0','0.5','Methanoflorens_stordalmirensis_v4.3.2_01361 Methyl-coenzyme M reductase I subunit gamma ','alias1'+"\n"].join("\t")+
87
- ['alias1','0','188','94.0','0.5','alias1 Methyl-coenzyme M reductase I subunit gamma ','Mstor_v4.3.2:1344'+"\n"].join("\t")
86
+ ['Mstor_v4.3.2:1344','0','188','0','0.0','Methanoflorens_stordalmirensis_v4.3.2_01361 Methyl-coenzyme M reductase I subunit gamma ','alias1'+"\n"].join("\t")+
87
+ ['alias1','0','188','0','0.0','alias1 Methyl-coenzyme M reductase I subunit gamma ','Mstor_v4.3.2:1344'+"\n"].join("\t")
88
88
  stdout.should eq(answer), test_file
89
89
  end
90
90
 
@@ -103,4 +103,16 @@ describe script_under_test do
103
103
  stdout.should eq(answer)
104
104
  end
105
105
  end
106
+
107
+ it 'should work with the newer file format, wherever that came from' do
108
+ test_file = "#{path_to_script} #{TEST_DATA_DIR}/new_format.csv --trace error"
109
+ status, stdout, stderr = systemu test_file
110
+
111
+ stderr.should eq("")
112
+ answer = header+
113
+ ['E1D_raw_1__154436_3','17','41','58.0','0.8787878787878788','# 1956 # 2171 # 1 # ID=154436_3;partial=01;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154435_1'+"\n"].join("\t")+
114
+ ['E1D_raw_1__154435_1','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154436_3'+"\n"].join("\t")+
115
+ ['E1D_raw_1__40591_2','8','0','8.0','0.12121212121212122','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',"\n"].join("\t")
116
+ stdout.should eq(answer)
117
+ end
106
118
  end
metadata CHANGED
@@ -1,110 +1,97 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: divvy_proteomics
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
5
- prerelease:
4
+ version: 0.1.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Ben J Woodcroft
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-04-13 00:00:00.000000000 Z
11
+ date: 2013-11-05 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: bio-logger
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ! '>='
17
+ - - '>='
20
18
  - !ruby/object:Gem::Version
21
19
  version: '0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ! '>='
24
+ - - '>='
28
25
  - !ruby/object:Gem::Version
29
26
  version: '0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: systemu
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - '>='
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0'
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - '>='
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0'
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: rspec
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
- - - ! '>='
45
+ - - '>='
52
46
  - !ruby/object:Gem::Version
53
47
  version: 2.8.0
54
48
  type: :development
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
- - - ! '>='
52
+ - - '>='
60
53
  - !ruby/object:Gem::Version
61
54
  version: 2.8.0
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: rdoc
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
- - - ! '>='
59
+ - - '>='
68
60
  - !ruby/object:Gem::Version
69
61
  version: '3.12'
70
62
  type: :development
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
- - - ! '>='
66
+ - - '>='
76
67
  - !ruby/object:Gem::Version
77
68
  version: '3.12'
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: bundler
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
- - - ! '>='
73
+ - - '>='
84
74
  - !ruby/object:Gem::Version
85
75
  version: 1.0.0
86
76
  type: :development
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
- - - ! '>='
80
+ - - '>='
92
81
  - !ruby/object:Gem::Version
93
82
  version: 1.0.0
94
83
  - !ruby/object:Gem::Dependency
95
84
  name: jeweler
96
85
  requirement: !ruby/object:Gem::Requirement
97
- none: false
98
86
  requirements:
99
- - - ! '>='
87
+ - - '>='
100
88
  - !ruby/object:Gem::Version
101
89
  version: 1.8.4
102
90
  type: :development
103
91
  prerelease: false
104
92
  version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
93
  requirements:
107
- - - ! '>='
94
+ - - '>='
108
95
  - !ruby/object:Gem::Version
109
96
  version: 1.8.4
110
97
  description: divvy up spectra from DTASelect files in a somewhat parsimonious way
@@ -126,8 +113,10 @@ files:
126
113
  - bin/divvy_spectra
127
114
  - divvy_proteomics.gemspec
128
115
  - lib/divvy_proteomics.rb
116
+ - lib/dta_select_output.rb
129
117
  - spec/data/merge_definition.csv
130
118
  - spec/data/multiply_mapped_spectra.csv
119
+ - spec/data/new_format.csv
131
120
  - spec/data/single_protein.csv
132
121
  - spec/data/single_protein_with_aliases.csv
133
122
  - spec/data/three_proteins.csv
@@ -138,29 +127,25 @@ files:
138
127
  homepage: http://github.com/wwood/divvy_proteomics
139
128
  licenses:
140
129
  - MIT
130
+ metadata: {}
141
131
  post_install_message:
142
132
  rdoc_options: []
143
133
  require_paths:
144
134
  - lib
145
135
  required_ruby_version: !ruby/object:Gem::Requirement
146
- none: false
147
136
  requirements:
148
- - - ! '>='
137
+ - - '>='
149
138
  - !ruby/object:Gem::Version
150
139
  version: '0'
151
- segments:
152
- - 0
153
- hash: -659530255
154
140
  required_rubygems_version: !ruby/object:Gem::Requirement
155
- none: false
156
141
  requirements:
157
- - - ! '>='
142
+ - - '>='
158
143
  - !ruby/object:Gem::Version
159
144
  version: '0'
160
145
  requirements: []
161
146
  rubyforge_project:
162
- rubygems_version: 1.8.24
147
+ rubygems_version: 2.0.3
163
148
  signing_key:
164
- specification_version: 3
149
+ specification_version: 4
165
150
  summary: divvy up spectra from DTASelect files in a parsimonious way
166
151
  test_files: []