divvy_proteomics 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +12 -5
- data/VERSION +1 -1
- data/bin/divvy_spectra +27 -189
- data/divvy_proteomics.gemspec +6 -4
- data/lib/dta_select_output.rb +215 -0
- data/spec/data/new_format.csv +49 -0
- data/spec/divvy_proteomics_spec.rb +14 -2
- metadata +21 -36
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f930bb8ef783e793512f0d50bd579055ba475476
|
4
|
+
data.tar.gz: a445f2ef2ea22f7453b4e5b08fb66775797ea9bc
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9020c67860e40394138a02cfcf53a85665cc731292650f1c71b46990d3800de2fe9c2643597a277fe09ca17eede493ec7268496e529bb1a625efdd5f0347c7a9
|
7
|
+
data.tar.gz: 462c1c66db781653937b156023dc404804d8002f661872dab7b74c8a79d8576cf2110262049e0d6964cfa8162c0bb59d503a6a89841bf4d783eac887dfb14190
|
data/README.md
CHANGED
@@ -1,15 +1,22 @@
|
|
1
1
|
# divvy_proteomics
|
2
2
|
|
3
|
-
Takes a DTASelect CSV file, and parses the
|
3
|
+
Takes a DTASelect CSV file, and parses the result so non-unique peptides get accounted for.
|
4
4
|
|
5
5
|
## Install
|
6
|
-
Get ruby somehow, if you don't already have it.
|
6
|
+
Get ruby somehow, if you don't already have it. Then, install this gem:
|
7
7
|
```
|
8
|
-
gem install divvy_spectra
|
8
|
+
$ gem install divvy_proteomics
|
9
9
|
```
|
10
10
|
|
11
11
|
## Usage
|
12
12
|
```
|
13
|
+
$ divvy_spectra <DTASelectFile>
|
14
|
+
```
|
15
|
+
Output is a tab-separated table with one row per protein and several columns, including the number of unique spectra and the
|
16
|
+
estimated number of spectral counts after sorting out the non-uniqueness.
|
17
|
+
|
18
|
+
Full usage information:
|
19
|
+
```
|
13
20
|
$ divvy_spectra -h
|
14
21
|
|
15
22
|
Usage: divvy_spectra [options] <DTASelect_file>
|
@@ -28,7 +35,7 @@ Verbosity:
|
|
28
35
|
--trace options Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG
|
29
36
|
```
|
30
37
|
|
31
|
-
|
38
|
+
## Contributing to divvy\_proteomics
|
32
39
|
|
33
40
|
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
34
41
|
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
|
@@ -38,7 +45,7 @@ Verbosity:
|
|
38
45
|
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
39
46
|
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
|
40
47
|
|
41
|
-
|
48
|
+
## Copyright
|
42
49
|
|
43
50
|
Copyright (c) 2013 Ben J Woodcroft. See LICENSE.txt for
|
44
51
|
further details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.1
|
1
|
+
0.1.0
|
data/bin/divvy_spectra
CHANGED
@@ -7,6 +7,10 @@ require 'set'
|
|
7
7
|
|
8
8
|
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
9
9
|
|
10
|
+
rootpath = File.dirname(File.dirname(__FILE__))
|
11
|
+
$: << File.join(rootpath,'lib')
|
12
|
+
require 'dta_select_output'
|
13
|
+
|
10
14
|
# Parse command line options into the options hash
|
11
15
|
options = {
|
12
16
|
:logger => 'stderr',
|
@@ -39,88 +43,7 @@ end
|
|
39
43
|
# Setup logging
|
40
44
|
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
41
45
|
|
42
|
-
class SelectedProtein
|
43
|
-
attr_accessor :identifier
|
44
|
-
|
45
|
-
attr_accessor :sequence_count, :spectrum_count, :sequence_coverage, :length, :molwt, :pi, :validation_status, :descriptive_name
|
46
|
-
|
47
|
-
attr_accessor :peptides
|
48
|
-
|
49
|
-
def initialize
|
50
|
-
@peptides = []
|
51
|
-
end
|
52
|
-
|
53
|
-
def unique_spectra
|
54
|
-
return 0 if @peptides.nil? or @peptides.empty?
|
55
|
-
num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
56
|
-
num ||= 0
|
57
|
-
return num
|
58
|
-
end
|
59
|
-
|
60
|
-
def non_unique_spectra
|
61
|
-
return 0 if @peptides.nil? or @peptides.empty?
|
62
|
-
num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
63
|
-
num ||= 0
|
64
|
-
return num
|
65
|
-
end
|
66
|
-
|
67
|
-
# Are there any peptides that are assigned exclusively to this protein?
|
68
|
-
def uniquely_identified_by_any_peptides?
|
69
|
-
unique_spectra > 0
|
70
|
-
end
|
71
|
-
|
72
|
-
def estimated_spectral_count
|
73
|
-
# How many unique spectra are there for each protein that shares a peptide with the current peptide
|
74
|
-
return 0 if @peptides.nil? or @peptides.empty?
|
75
|
-
peptide_shares = []
|
76
|
-
# If all peptides are non-unique and shared with some number of other proteins, then output a negative number num shared spectra divided by the number of proteins
|
77
|
-
if !uniquely_identified_by_any_peptides?
|
78
|
-
shared_parents = peptides[0].parent_proteins
|
79
|
-
if peptides.find{|pep| pep.parent_proteins != shared_parents}
|
80
|
-
log.warn "Found a protein (#{identifier}) that shares all its peptides with a non-constant set of proteins, hoping this is a rare event, estimated spectral count likely wrong"
|
81
|
-
end
|
82
|
-
num_shared_proteins = shared_parents.length
|
83
|
-
num_peptide_spectra = peptides.collect{|pep| pep.redundancy}.reduce(:+)
|
84
|
-
log.debug "Found #{num_shared_proteins} shared peptides and #{num_peptide_spectra} peptide spectra"
|
85
|
-
return num_peptide_spectra.to_f/num_shared_proteins
|
86
|
-
else
|
87
|
-
peptides.each do |peptide|
|
88
|
-
log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
|
89
|
-
log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
|
90
|
-
total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
|
91
|
-
peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
|
92
|
-
end
|
93
|
-
return peptide_shares.reduce(:+)
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
def log
|
98
|
-
Bio::Log::LoggerPlus[LOG_NAME]
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
|
-
class Peptide
|
103
|
-
attr_accessor :identifier
|
104
|
-
|
105
|
-
attr_accessor :reported_unique
|
106
46
|
|
107
|
-
attr_accessor :xcorr, :deltcn, :obs_mono_mz, :cal_mono_mz, :total_intensity, :sp_rank, :sp_score, :ion_proportion, :redundancy, :sequence
|
108
|
-
|
109
|
-
attr_accessor :unique
|
110
|
-
|
111
|
-
attr_accessor :parent_proteins
|
112
|
-
def initialize
|
113
|
-
@parent_proteins = []
|
114
|
-
end
|
115
|
-
|
116
|
-
def inspect
|
117
|
-
str = "Peptide: #{@parent_proteins.length} @parent_proteins: [#{@parent_proteins.collect{|pro| pro.identifier}.join(', ')}]"
|
118
|
-
[:identifier, :xcorr, :deltcn, :obs_mono_mz, :cal_mono_mz, :total_intensity, :sp_rank, :sp_score, :ion_proportion, :redundancy, :sequence].each do |var|
|
119
|
-
str += ", #{var}: #{send(var)}"
|
120
|
-
end
|
121
|
-
return str
|
122
|
-
end
|
123
|
-
end
|
124
47
|
|
125
48
|
# Read in merges, if required
|
126
49
|
mergers = {}
|
@@ -147,107 +70,12 @@ if options[:whitelist_file]
|
|
147
70
|
log.info "Read in #{whitelist.length} IDs into the whitelist, only those will be reported. e.g. #{whitelist[0]}"
|
148
71
|
end
|
149
72
|
|
73
|
+
# Parse the csv file
|
74
|
+
parsed = Bio::DTASelect::OutputFile.parse(ARGF)
|
150
75
|
|
151
76
|
# Hashes of identifiers to objects
|
152
|
-
proteins =
|
153
|
-
hits =
|
154
|
-
|
155
|
-
# Read in the tab separated file
|
156
|
-
reading_header = true
|
157
|
-
current_proteins = []
|
158
|
-
last_line_was_protein_name = false
|
159
|
-
|
160
|
-
# Parse each line of the DTAselect file
|
161
|
-
ARGF.each_line do |line|
|
162
|
-
splits = line.chomp.split("\t")
|
163
|
-
log.debug "Parsing line `#{line.chomp}'"
|
164
|
-
|
165
|
-
if reading_header
|
166
|
-
log.debug "reading header"
|
167
|
-
if splits[0] == 'Unique'
|
168
|
-
reading_header = false
|
169
|
-
end
|
170
|
-
next
|
171
|
-
end
|
172
|
-
|
173
|
-
# OK, now we are reading the actual table, not the header
|
174
|
-
if splits[0] != '' and splits[11].nil?
|
175
|
-
ident = splits[0]
|
176
|
-
|
177
|
-
if !last_line_was_protein_name
|
178
|
-
# Sometimes several proteins are given all in the one header line
|
179
|
-
# start a new protein
|
180
|
-
log.debug "New protein now being parsed"
|
181
|
-
current_proteins = []
|
182
|
-
end
|
183
|
-
|
184
|
-
current_protein = SelectedProtein.new
|
185
|
-
last_line_was_protein_name = true
|
186
|
-
current_proteins.push current_protein
|
187
|
-
|
188
|
-
current_protein.identifier = ident
|
189
|
-
|
190
|
-
i = 1
|
191
|
-
current_protein.sequence_count = splits[i].to_i; i+=1
|
192
|
-
current_protein.spectrum_count = splits[i].to_i; i+=1
|
193
|
-
current_protein.sequence_coverage = splits[i].to_f; i+=1
|
194
|
-
current_protein.length = splits[i].to_i; i+=1
|
195
|
-
current_protein.molwt = splits[i].to_f; i+=1
|
196
|
-
current_protein.pi = splits[i].to_f; i+=1
|
197
|
-
current_protein.validation_status = splits[i].to_f; i+=1
|
198
|
-
current_protein.descriptive_name = splits[i]
|
199
|
-
|
200
|
-
if proteins[ident]
|
201
|
-
raise "Unexpectedly found the same protein identifier twice: #{ident}, from line #{line.chomp}"
|
202
|
-
end
|
203
|
-
proteins[ident] = current_protein
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
elsif splits[1] == 'Proteins'
|
208
|
-
# Done processing, except for the bits down the bottom which aren't parsed (yet)
|
209
|
-
break
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
else
|
214
|
-
log.debug "New spectra now being parsed"
|
215
|
-
last_line_was_protein_name = false
|
216
|
-
|
217
|
-
# Record a spectra
|
218
|
-
ident = splits[1]
|
219
|
-
raise "Unexpected hits name `#{ident}', from line `#{line.chomp}'" unless ident.length > 10
|
220
|
-
|
221
|
-
pep = hits[ident]
|
222
|
-
if pep.nil?
|
223
|
-
pep = Peptide.new
|
224
|
-
pep.identifier = ident
|
225
|
-
pep.reported_unique = splits[0]
|
226
|
-
|
227
|
-
i = 2
|
228
|
-
pep.xcorr = splits[i].to_f; i+= 1
|
229
|
-
pep.deltcn = splits[i].to_f; i+= 1
|
230
|
-
pep.obs_mono_mz = splits[i].to_f; i+= 1
|
231
|
-
pep.cal_mono_mz = splits[i].to_f; i+= 1
|
232
|
-
pep.total_intensity = splits[i].to_f; i+= 1
|
233
|
-
pep.sp_rank = splits[i].to_f; i+= 1
|
234
|
-
pep.sp_score = splits[i].to_f; i+= 1
|
235
|
-
pep.ion_proportion = splits[i].to_f; i+= 1
|
236
|
-
pep.redundancy = splits[i].to_i; i+= 1
|
237
|
-
pep.sequence = splits[i]
|
238
|
-
|
239
|
-
hits[ident] = pep
|
240
|
-
end
|
241
|
-
|
242
|
-
current_proteins.each do |current_protein|
|
243
|
-
pep.parent_proteins.push current_protein
|
244
|
-
current_protein.peptides.push pep
|
245
|
-
end
|
246
|
-
log.debug "Parsed this peptide #{pep.inspect}"
|
247
|
-
end
|
248
|
-
end
|
249
|
-
|
250
|
-
log.debug "Proteins parsed: #{proteins.inspect}"
|
77
|
+
proteins = parsed.protein_name_to_object
|
78
|
+
hits = parsed.peptide_name_to_object
|
251
79
|
|
252
80
|
|
253
81
|
# Merge proteins that are known duplicates if need be
|
@@ -304,10 +132,23 @@ end
|
|
304
132
|
|
305
133
|
|
306
134
|
# Total spectra shouldn't count contaminants, but shared spectra should still be divvied up with
|
307
|
-
|
308
|
-
|
135
|
+
# Annoying thing here is when contaminating proteins share spectra
|
136
|
+
total_contaminating_peptides = hits.collect do |ident, peptide|
|
137
|
+
num_contaminating_parents = peptide.parent_proteins.select do |prot|
|
138
|
+
prot.identifier.match(options[:contaminant_prefix])
|
139
|
+
end.length
|
140
|
+
|
141
|
+
if num_contaminating_parents > 0
|
142
|
+
peptide.redundancy
|
143
|
+
else
|
144
|
+
0
|
145
|
+
end
|
146
|
+
end
|
147
|
+
total_contaminating_spectra = total_contaminating_peptides.reduce :+
|
148
|
+
total_contaminating_spectra ||= 0 # reduce(:+) yields nil for an empty collection; fall back to a numeric zero because this value is subtracted from the total spectra count
|
149
|
+
log.info "Found #{total_contaminating_spectra} contaminating spectral counts"
|
309
150
|
|
310
|
-
total_spectra = hits.collect{|
|
151
|
+
total_spectra = hits.collect{|ident, pep| pep.redundancy}.reduce(:+) - total_contaminating_spectra
|
311
152
|
log.info "Parsed in #{proteins.length} proteins and #{hits.length} peptides, and #{total_spectra.to_i} non-contaminating spectra"
|
312
153
|
|
313
154
|
log.debug "Proteins parsed: #{proteins.inspect}"
|
@@ -320,9 +161,9 @@ log.info "Found #{number_shared_peptides} (#{number_shared_peptides.to_f/total_p
|
|
320
161
|
|
321
162
|
# Find non-starred peptides that occur only once in the file - maybe not possible given a correctly formatted file?
|
322
163
|
non_starred_but_uniquely_identified_peptides = hits.values.select do |peptide|
|
323
|
-
peptide.
|
164
|
+
peptide.dtaselect_attributes['Unique'] == nil and peptide.parent_proteins.length == 1
|
324
165
|
end
|
325
|
-
log.
|
166
|
+
log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
|
326
167
|
|
327
168
|
# OK, finished parsing the file. Now output the score for each protein
|
328
169
|
puts [
|
@@ -334,15 +175,12 @@ puts [
|
|
334
175
|
'Description',
|
335
176
|
'Proteins sharing spectra',
|
336
177
|
].join "\t"
|
178
|
+
log.warn "No unique spectra found!" if total_spectra == 0
|
337
179
|
proteins.each do |protein_id, protein|
|
338
180
|
next if protein_id.match(options[:contaminant_prefix]) #Don't print contaminants
|
339
181
|
|
340
182
|
if options[:whitelist_file].nil? or whitelist.include?(protein_id) # If there's a whitelist, apply it now
|
341
183
|
log.debug "Now printing protein #{protein_id}, which has #{protein.peptides.length} associated peptides"
|
342
|
-
if !protein.uniquely_identified_by_any_peptides?
|
343
|
-
shareds = protein.peptides.collect{|pep| pep.parent_proteins.collect{|pro| pro.identifier}}.flatten.uniq.reject{|pro_id| pro_id==protein_id}
|
344
|
-
log.warn "This protein #{protein_id} shares all of its spectra with other proteins (#{shareds.join(', ')}), sharing the peptides equally (this may not be appropriate)"
|
345
|
-
end
|
346
184
|
puts [
|
347
185
|
protein_id,
|
348
186
|
protein.unique_spectra,
|
data/divvy_proteomics.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "divvy_proteomics"
|
8
|
-
s.version = "0.0.1"
|
8
|
+
s.version = "0.1.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Ben J Woodcroft"]
|
12
|
-
s.date = "2013-
|
12
|
+
s.date = "2013-11-05"
|
13
13
|
s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
|
14
14
|
s.email = "donttrustben@gmail.com"
|
15
15
|
s.executables = ["divvy_spectra"]
|
@@ -28,8 +28,10 @@ Gem::Specification.new do |s|
|
|
28
28
|
"bin/divvy_spectra",
|
29
29
|
"divvy_proteomics.gemspec",
|
30
30
|
"lib/divvy_proteomics.rb",
|
31
|
+
"lib/dta_select_output.rb",
|
31
32
|
"spec/data/merge_definition.csv",
|
32
33
|
"spec/data/multiply_mapped_spectra.csv",
|
34
|
+
"spec/data/new_format.csv",
|
33
35
|
"spec/data/single_protein.csv",
|
34
36
|
"spec/data/single_protein_with_aliases.csv",
|
35
37
|
"spec/data/three_proteins.csv",
|
@@ -41,11 +43,11 @@ Gem::Specification.new do |s|
|
|
41
43
|
s.homepage = "http://github.com/wwood/divvy_proteomics"
|
42
44
|
s.licenses = ["MIT"]
|
43
45
|
s.require_paths = ["lib"]
|
44
|
-
s.rubygems_version = "
|
46
|
+
s.rubygems_version = "2.0.3"
|
45
47
|
s.summary = "divvy up spectra from DTASelect files in a parsimonious way"
|
46
48
|
|
47
49
|
if s.respond_to? :specification_version then
|
48
|
-
s.specification_version =
|
50
|
+
s.specification_version = 4
|
49
51
|
|
50
52
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
51
53
|
s.add_runtime_dependency(%q<bio-logger>, [">= 0"])
|
@@ -0,0 +1,215 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
|
4
|
+
module Bio::DTASelect
|
5
|
+
module Logging
|
6
|
+
def log
|
7
|
+
Bio::Log::LoggerPlus['divvy_spectra']
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
class OutputFile
|
12
|
+
def self.log
|
13
|
+
SelectedProtein.new.log
|
14
|
+
end
|
15
|
+
|
16
|
+
class SelectedProtein
|
17
|
+
include Bio::DTASelect::Logging
|
18
|
+
|
19
|
+
attr_accessor :identifier
|
20
|
+
|
21
|
+
attr_accessor :sequence_count, :spectrum_count, :sequence_coverage, :length, :molwt, :pi, :validation_status, :descriptive_name
|
22
|
+
|
23
|
+
attr_accessor :peptides
|
24
|
+
|
25
|
+
def initialize
|
26
|
+
@peptides = []
|
27
|
+
end
|
28
|
+
|
29
|
+
def unique_spectra
|
30
|
+
return 0 if @peptides.nil? or @peptides.empty?
|
31
|
+
num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
32
|
+
num ||= 0
|
33
|
+
return num
|
34
|
+
end
|
35
|
+
|
36
|
+
def non_unique_spectra
|
37
|
+
return 0 if @peptides.nil? or @peptides.empty?
|
38
|
+
num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
39
|
+
num ||= 0
|
40
|
+
return num
|
41
|
+
end
|
42
|
+
|
43
|
+
# Are there any peptides that are assigned exclusively to this protein?
|
44
|
+
def uniquely_identified_by_any_peptides?
|
45
|
+
unique_spectra > 0
|
46
|
+
end
|
47
|
+
|
48
|
+
def estimated_spectral_count
|
49
|
+
# How many unique spectra are there for each protein that shares a peptide with the current peptide
|
50
|
+
return 0 if @peptides.nil? or @peptides.empty?
|
51
|
+
peptide_shares = []
|
52
|
+
# If all peptides are non-unique and shared with some number of other proteins, then output a negative number num shared spectra divided by the number of proteins
|
53
|
+
if !uniquely_identified_by_any_peptides?
|
54
|
+
# Don't attempt to divvy these up, because there are too many assumptions involved
|
55
|
+
return 0
|
56
|
+
else
|
57
|
+
peptides.each do |peptide|
|
58
|
+
log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
|
59
|
+
log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
|
60
|
+
total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
|
61
|
+
peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
|
62
|
+
end
|
63
|
+
return peptide_shares.reduce(:+)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def log
|
68
|
+
Bio::Log::LoggerPlus['divvy_spectra'] # use the literal logger name (same as Bio::DTASelect::Logging#log) so the library does not depend on the LOG_NAME constant defined only in bin/divvy_spectra
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
class Peptide
|
73
|
+
include Bio::DTASelect::Logging
|
74
|
+
|
75
|
+
attr_accessor :identifier
|
76
|
+
|
77
|
+
# Hash of column names to values. These are different for different DTAselect output files, it seems.
|
78
|
+
attr_accessor :dtaselect_attributes
|
79
|
+
|
80
|
+
# Array of proteins that have this peptide associated
|
81
|
+
attr_accessor :parent_proteins
|
82
|
+
|
83
|
+
def initialize
|
84
|
+
@parent_proteins = []
|
85
|
+
end
|
86
|
+
|
87
|
+
def inspect
|
88
|
+
"Peptide: #{@parent_proteins.length} @parent_proteins: [#{@parent_proteins.collect{|pro| pro.identifier}.join(', ')} @identifier: #{identifier}, @attributes: #{dtaselect_attributes.inspect}]"
|
89
|
+
end
|
90
|
+
|
91
|
+
def redundancy
|
92
|
+
@dtaselect_attributes['Redundancy'].to_i
|
93
|
+
end
|
94
|
+
|
95
|
+
def reported_unique?
|
96
|
+
dtaselect_attributes.length == 1
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
class Result
|
101
|
+
include Bio::DTASelect::Logging
|
102
|
+
|
103
|
+
# hash of protein identifier to Protein object
|
104
|
+
attr_accessor :protein_name_to_object
|
105
|
+
|
106
|
+
# hash of peptide identifier to Peptide object
|
107
|
+
attr_accessor :peptide_name_to_object
|
108
|
+
end
|
109
|
+
|
110
|
+
def self.parse(io)
|
111
|
+
result = Result.new
|
112
|
+
|
113
|
+
# Hashes of identifiers to objects
|
114
|
+
result.protein_name_to_object = {}
|
115
|
+
result.peptide_name_to_object = {}
|
116
|
+
|
117
|
+
# Read in the tab separated file
|
118
|
+
reading_header = true
|
119
|
+
current_proteins = []
|
120
|
+
last_line_was_protein_name = false
|
121
|
+
peptide_attribute_names = nil
|
122
|
+
|
123
|
+
# Parse each line of the DTAselect file
|
124
|
+
io.each_line do |line|
|
125
|
+
splits = line.chomp.split("\t")
|
126
|
+
log.debug "Parsing line `#{line.chomp}'"
|
127
|
+
|
128
|
+
if reading_header
|
129
|
+
log.debug "reading header"
|
130
|
+
if splits[0] == 'Unique'
|
131
|
+
reading_header = false
|
132
|
+
|
133
|
+
# Current line describes the peptide attributes
|
134
|
+
peptide_attribute_names = splits
|
135
|
+
|
136
|
+
# This field has special importance, so be picky
|
137
|
+
raise "Badly parsed file at this line: #{line.inspect}, expected 2nd field to be 'FileName', found #{splits[1]}" unless splits[1] == 'FileName'
|
138
|
+
end
|
139
|
+
next
|
140
|
+
end
|
141
|
+
|
142
|
+
# OK, now we are reading the actual table, not the header
|
143
|
+
if splits[0] != '' and splits[11].nil?
|
144
|
+
ident = splits[0]
|
145
|
+
|
146
|
+
if !last_line_was_protein_name
|
147
|
+
# Sometimes several proteins are given all in the one header line
|
148
|
+
# start a new protein
|
149
|
+
log.debug "New protein now being parsed"
|
150
|
+
current_proteins = []
|
151
|
+
end
|
152
|
+
|
153
|
+
current_protein = SelectedProtein.new
|
154
|
+
last_line_was_protein_name = true
|
155
|
+
current_proteins.push current_protein
|
156
|
+
|
157
|
+
current_protein.identifier = ident
|
158
|
+
|
159
|
+
i = 1
|
160
|
+
current_protein.sequence_count = splits[i].to_i; i+=1
|
161
|
+
current_protein.spectrum_count = splits[i].to_i; i+=1
|
162
|
+
current_protein.sequence_coverage = splits[i].to_f; i+=1
|
163
|
+
current_protein.length = splits[i].to_i; i+=1
|
164
|
+
current_protein.molwt = splits[i].to_f; i+=1
|
165
|
+
current_protein.pi = splits[i].to_f; i+=1
|
166
|
+
current_protein.validation_status = splits[i].to_f; i+=1
|
167
|
+
current_protein.descriptive_name = splits[i]
|
168
|
+
|
169
|
+
if result.protein_name_to_object[ident]
|
170
|
+
raise "Unexpectedly found the same protein identifier twice: #{ident}, from line #{line.chomp}"
|
171
|
+
end
|
172
|
+
result.protein_name_to_object[ident] = current_protein
|
173
|
+
|
174
|
+
|
175
|
+
|
176
|
+
elsif splits[1] == 'Proteins'
|
177
|
+
# Done processing, except for the bits down the bottom which aren't parsed (yet)
|
178
|
+
break
|
179
|
+
|
180
|
+
|
181
|
+
|
182
|
+
else
|
183
|
+
log.debug "New spectra now being parsed"
|
184
|
+
last_line_was_protein_name = false
|
185
|
+
|
186
|
+
# Record a spectra
|
187
|
+
ident = splits[1]
|
188
|
+
raise "Unexpected hits name `#{ident}', from line `#{line.chomp}'" unless ident.length > 10
|
189
|
+
|
190
|
+
pep = result.peptide_name_to_object[ident]
|
191
|
+
if pep.nil?
|
192
|
+
pep = Peptide.new
|
193
|
+
pep.identifier = ident
|
194
|
+
|
195
|
+
peptide_attribute_names.each_with_index do |attribute_name,i|
|
196
|
+
pep.dtaselect_attributes ||= {}
|
197
|
+
pep.dtaselect_attributes[attribute_name] = splits[i]
|
198
|
+
end
|
199
|
+
|
200
|
+
result.peptide_name_to_object[ident] = pep
|
201
|
+
end
|
202
|
+
|
203
|
+
current_proteins.each do |current_protein|
|
204
|
+
pep.parent_proteins.push current_protein
|
205
|
+
current_protein.peptides.push pep
|
206
|
+
end
|
207
|
+
log.debug "Parsed this peptide #{pep.inspect}"
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}"
|
212
|
+
return result
|
213
|
+
end
|
214
|
+
end
|
215
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
DTASelect v1.9
|
2
|
+
/auto/gtl/ms/Abisko_Soils/Field_Sampling_Aug_2010/SurfDeep/Erio_Deep_Aug2010_27Jun_TR1/Frac1/analysis/tryp_edeep_082010_500bp_Wheat/sequest
|
3
|
+
/auto/gtl/db/Abisko_Soils/edeep_082010_500bp_Wheat_cntm_psm
|
4
|
+
SEQUEST v.27 in SQT format.
|
5
|
+
--DB -p 2 -r 1000
|
6
|
+
true Use criteria
|
7
|
+
1.8 Minimum +1 XCorr
|
8
|
+
2.5 Minimum +2 XCorr
|
9
|
+
3.5 Minimum +3 XCorr
|
10
|
+
0.08 Minimum DeltCN
|
11
|
+
1 Minimum charge state
|
12
|
+
3 Maximum charge state
|
13
|
+
0.0 Minimum ion proportion
|
14
|
+
1000 Maximum Sp rank
|
15
|
+
-1.0 Minimum Sp score
|
16
|
+
Include Modified peptide inclusion
|
17
|
+
Any Tryptic status requirement
|
18
|
+
true Multiple, ambiguous IDs allowed
|
19
|
+
Ignore Peptide validation handling
|
20
|
+
XCorr Purge duplicate peptides by protein
|
21
|
+
false Include only loci with unique peptide
|
22
|
+
false Remove subset proteins
|
23
|
+
Ignore Locus validation handling
|
24
|
+
0 Minimum modified peptides per locus
|
25
|
+
1000 Minimum redundancy for low coverage loci
|
26
|
+
2 Minimum peptides per locus
|
27
|
+
|
28
|
+
Locus Sequence Count Spectrum Count Sequence Coverage Length MolWt pI Validation Status Descriptive Name
|
29
|
+
Unique FileName XCorr DeltCN Obs_mono_m/z Calc_mono_m/z PPM Delta_amu TotalIntensity SpRank SpScore IonProportion Redundancy Sequence
|
30
|
+
E1D_raw_1__154436_3 4 58 81.9% 72 7500 7.3 U # 1956 # 2171 # 1 # ID=154436_3;partial=01;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
|
31
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3 5.8526 0.4034 2772.4479 2772.4445 1.2211 0.0034 6048.2 1 1349.0 40.0 30 -.MLSIQTNIAALSAQNALTTTNNNLQK.S
|
32
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3 4.5965 0.4101 3275.6674 3275.6608 2.0124 0.0066 5944.2 1 884.7 28.4 7 -.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
|
33
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2 4.5641 0.462 1594.7954 1594.7956 -0.1435 -0.0002 4981.9 1 1347.9 76.7 4 R.INHAADDAAGLAISEK.M
|
34
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_04.07913.07913.2 4.8897 0.2913 1384.7232 1384.7250 -1.2769 -0.0018 5676.4 1 1767.3 91.7 17 K.MQAQIGGLNQAVR.N
|
35
|
+
E1D_raw_1__154435_1 3 41 79.3% 58 6132 7.3 U # 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
|
36
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3 5.8526 0.4034 2772.4479 2772.4445 1.2211 0.0034 6048.2 1 1349.0 40.0 30 -.MLSIQTNIAALSAQNALTTTNNNLQK.S
|
37
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3 4.5965 0.4101 3275.6674 3275.6608 2.0124 0.0066 5944.2 1 884.7 28.4 7 -.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
|
38
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2 4.5641 0.462 1594.7954 1594.7956 -0.1435 -0.0002 4981.9 1 1347.9 76.7 4 R.INHAADDAAGLAISEK.M
|
39
|
+
E1D_raw_1__40591_2 3 8 74.5% 51 5250 8.6 U # 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None
|
40
|
+
* 20120806_Erio_Deep_Aug2010_27Jun_TR1_01.18475.18475.2 4.3739 0.5173 2140.0658 2140.0653 0.2192 0.0005 7636.1 1 1642.9 65.0 1 K.TSDVAGDGTTTATILAQSIYR.E
|
41
|
+
* 20120806_Erio_Deep_Aug2010_27Jun_TR1_02.21883.21883.2 3.4843 0.1996 2553.2977 2553.2928 1.9293 0.0049 6903.6 1 903.4 47.9 3 K.TSDVAGDGTTTATILAQSIYREGVK.A
|
42
|
+
* 20120806_Erio_Deep_Aug2010_27Jun_TR1_08.06194.06194.2 2.7604 0.1784 1326.7055 1326.7083 -2.1145 -0.0028 6041.5 1 860.0 66.7 4 K.AVAAGANPMELKR.G
|
43
|
+
Proteins Peptide IDs Copies
|
44
|
+
Unfiltered 318515 400116 506301
|
45
|
+
Redundant 1575 3555 18759
|
46
|
+
Nonredundant 1211 2557 12384
|
47
|
+
|
48
|
+
Classification Nonredundant Proteins Redundant Proteins
|
49
|
+
Unclassified 0 0
|
@@ -83,8 +83,8 @@ describe script_under_test do
|
|
83
83
|
|
84
84
|
stderr.should eq("")
|
85
85
|
answer = header+
|
86
|
-
['Mstor_v4.3.2:1344','0','188','
|
87
|
-
['alias1','0','188','
|
86
|
+
['Mstor_v4.3.2:1344','0','188','0','0.0','Methanoflorens_stordalmirensis_v4.3.2_01361 Methyl-coenzyme M reductase I subunit gamma ','alias1'+"\n"].join("\t")+
|
87
|
+
['alias1','0','188','0','0.0','alias1 Methyl-coenzyme M reductase I subunit gamma ','Mstor_v4.3.2:1344'+"\n"].join("\t")
|
88
88
|
stdout.should eq(answer), test_file
|
89
89
|
end
|
90
90
|
|
@@ -103,4 +103,16 @@ describe script_under_test do
|
|
103
103
|
stdout.should eq(answer)
|
104
104
|
end
|
105
105
|
end
|
106
|
+
|
107
|
+
it 'should work with the newer file format, wherever that came from' do
|
108
|
+
test_file = "#{path_to_script} #{TEST_DATA_DIR}/new_format.csv --trace error"
|
109
|
+
status, stdout, stderr = systemu test_file
|
110
|
+
|
111
|
+
stderr.should eq("")
|
112
|
+
answer = header+
|
113
|
+
['E1D_raw_1__154436_3','17','41','58.0','0.8787878787878788','# 1956 # 2171 # 1 # ID=154436_3;partial=01;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154435_1'+"\n"].join("\t")+
|
114
|
+
['E1D_raw_1__154435_1','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154436_3'+"\n"].join("\t")+
|
115
|
+
['E1D_raw_1__40591_2','8','0','8.0','0.12121212121212122','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',"\n"].join("\t")
|
116
|
+
stdout.should eq(answer)
|
117
|
+
end
|
106
118
|
end
|
metadata
CHANGED
@@ -1,110 +1,97 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: divvy_proteomics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Ben J Woodcroft
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-11-05 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: bio-logger
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - '>='
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '0'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - '>='
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '0'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: systemu
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- -
|
31
|
+
- - '>='
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '0'
|
38
34
|
type: :development
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- -
|
38
|
+
- - '>='
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '0'
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: rspec
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
|
-
- -
|
45
|
+
- - '>='
|
52
46
|
- !ruby/object:Gem::Version
|
53
47
|
version: 2.8.0
|
54
48
|
type: :development
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
|
-
- -
|
52
|
+
- - '>='
|
60
53
|
- !ruby/object:Gem::Version
|
61
54
|
version: 2.8.0
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: rdoc
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
|
-
- -
|
59
|
+
- - '>='
|
68
60
|
- !ruby/object:Gem::Version
|
69
61
|
version: '3.12'
|
70
62
|
type: :development
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
|
-
- -
|
66
|
+
- - '>='
|
76
67
|
- !ruby/object:Gem::Version
|
77
68
|
version: '3.12'
|
78
69
|
- !ruby/object:Gem::Dependency
|
79
70
|
name: bundler
|
80
71
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
72
|
requirements:
|
83
|
-
- -
|
73
|
+
- - '>='
|
84
74
|
- !ruby/object:Gem::Version
|
85
75
|
version: 1.0.0
|
86
76
|
type: :development
|
87
77
|
prerelease: false
|
88
78
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
79
|
requirements:
|
91
|
-
- -
|
80
|
+
- - '>='
|
92
81
|
- !ruby/object:Gem::Version
|
93
82
|
version: 1.0.0
|
94
83
|
- !ruby/object:Gem::Dependency
|
95
84
|
name: jeweler
|
96
85
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
86
|
requirements:
|
99
|
-
- -
|
87
|
+
- - '>='
|
100
88
|
- !ruby/object:Gem::Version
|
101
89
|
version: 1.8.4
|
102
90
|
type: :development
|
103
91
|
prerelease: false
|
104
92
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
93
|
requirements:
|
107
|
-
- -
|
94
|
+
- - '>='
|
108
95
|
- !ruby/object:Gem::Version
|
109
96
|
version: 1.8.4
|
110
97
|
description: divvy up spectra from DTASelect files in a somewhat parsimonious way
|
@@ -126,8 +113,10 @@ files:
|
|
126
113
|
- bin/divvy_spectra
|
127
114
|
- divvy_proteomics.gemspec
|
128
115
|
- lib/divvy_proteomics.rb
|
116
|
+
- lib/dta_select_output.rb
|
129
117
|
- spec/data/merge_definition.csv
|
130
118
|
- spec/data/multiply_mapped_spectra.csv
|
119
|
+
- spec/data/new_format.csv
|
131
120
|
- spec/data/single_protein.csv
|
132
121
|
- spec/data/single_protein_with_aliases.csv
|
133
122
|
- spec/data/three_proteins.csv
|
@@ -138,29 +127,25 @@ files:
|
|
138
127
|
homepage: http://github.com/wwood/divvy_proteomics
|
139
128
|
licenses:
|
140
129
|
- MIT
|
130
|
+
metadata: {}
|
141
131
|
post_install_message:
|
142
132
|
rdoc_options: []
|
143
133
|
require_paths:
|
144
134
|
- lib
|
145
135
|
required_ruby_version: !ruby/object:Gem::Requirement
|
146
|
-
none: false
|
147
136
|
requirements:
|
148
|
-
- -
|
137
|
+
- - '>='
|
149
138
|
- !ruby/object:Gem::Version
|
150
139
|
version: '0'
|
151
|
-
segments:
|
152
|
-
- 0
|
153
|
-
hash: -659530255
|
154
140
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
155
|
-
none: false
|
156
141
|
requirements:
|
157
|
-
- -
|
142
|
+
- - '>='
|
158
143
|
- !ruby/object:Gem::Version
|
159
144
|
version: '0'
|
160
145
|
requirements: []
|
161
146
|
rubyforge_project:
|
162
|
-
rubygems_version:
|
147
|
+
rubygems_version: 2.0.3
|
163
148
|
signing_key:
|
164
|
-
specification_version:
|
149
|
+
specification_version: 4
|
165
150
|
summary: divvy up spectra from DTASelect files in a parsimonious way
|
166
151
|
test_files: []
|