mspire 0.8.5 → 0.8.6
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/hash/inverse.rb +15 -0
- data/lib/mspire/error_rate/qvalue.rb +5 -5
- data/lib/mspire/fasta.rb +2 -0
- data/lib/mspire/ident/peptide/db/creator.rb +48 -58
- data/lib/mspire/ident/peptide/db/io.rb +5 -0
- data/lib/mspire/ident/peptide_hit/qvalue.rb +2 -2
- data/lib/mspire/ident/peptide_hit.rb +2 -2
- data/lib/mspire/ident/protein_group.rb +4 -2
- data/lib/mspire/isotope/aa.rb +10 -10
- data/lib/mspire/mzml/instrument_configuration.rb +10 -3
- data/lib/mspire/quant/cmdline.rb +42 -0
- data/lib/mspire/quant/protein_group_comparison.rb +29 -0
- data/lib/mspire/quant/spectral_counts.rb +42 -0
- data/script/fasta_to_peptide_centric_db.rb +5 -0
- data/script/mascot_dat_to_peptide_hit_qvalues.rb +37 -45
- data/script/mass_correct.rb +118 -0
- data/script/minimal_protein_set.rb +345 -0
- data/script/mzml_to_mgf.rb +46 -0
- data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +275 -0
- data/spec/mspire/ident/peptide/db/creator_spec.rb +11 -0
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +157 -157
- metadata +11 -2
@@ -0,0 +1,275 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'andand'
|
4
|
+
require 'set'
|
5
|
+
require 'ruport'
|
6
|
+
|
7
|
+
require 'mspire/ident/peptide_hit/qvalue'
|
8
|
+
require 'mspire/ident/peptide_hit'
|
9
|
+
require 'mspire/ident/protein_group'
|
10
|
+
require 'mspire/ident/protein'
|
11
|
+
require 'mspire/ident/peptide/db/io'
|
12
|
+
require 'mspire/quant/spectral_counts'
|
13
|
+
require 'mspire/quant/protein_group_comparison'
|
14
|
+
require 'mspire/quant/qspec/protein_group_comparison'
|
15
|
+
require 'mspire/quant/qspec'
|
16
|
+
require 'mspire/quant/cmdline'
|
17
|
+
require 'mspire/fasta'
|
18
|
+
|
19
|
+
|
20
|
+
require 'yaml'
|
21
|
+
require 'tempfile'
|
22
|
+
|
23
|
+
require 'trollop'
|
24
|
+
|
25
|
+
# Verbose-only puts: writes +args+ with Kernel#puts and flushes $stdout,
# but only when $VERBOSE is truthy. Does nothing (returns nil) otherwise.
def putsv(*args)
  return unless $VERBOSE
  puts(*args)
  $stdout.flush
end
|
30
|
+
|
31
|
+
# Strips the final extension from +file+ and, if what remains ends in
# '.phq', strips that too (e.g. "f1.phq.tsv" => "f1", "run.tsv" => "run").
# Any leading directory part is left untouched.
def basename(file)
  stem = file.chomp(File.extname(file))
  inner_ext = File.extname(stem)
  inner_ext == '.phq' ? stem.chomp(inner_ext) : stem
end
|
36
|
+
|
37
|
+
# Script-local extensions to Ruport tables.
class Ruport::Data::Table
  # Adds a column called +colname+ (+opts+ are handed to #add_column) and
  # fills it row by row from +array_of_data+, pairing rows and values by
  # position.
  #
  # returns self
  def add_column_with_data(colname, array_of_data, opts={})
    add_column(colname, opts)
    data.zip(array_of_data).each do |row, value|
      row[colname] = value
    end
    self
  end

  # Writes the table to +file+ as tab-separated text: the column names on
  # one line, then one line per row.
  #
  # acceptable opts:
  #
  #     :header => an array of lines written before the column names
  #     :footer => an array of lines written after the last row
  #
  # Every header/footer line is written prefixed with "# ".
  def to_tsv(file, opt={})
    separator = "\t"
    File.open(file, 'w') do |out|
      write_commented = lambda do |lines|
        lines.each {|line| out.puts "# #{line}" } if lines
      end
      write_commented.call(opt[:header])
      out.puts column_names.join(separator)
      data.each {|row| out.puts row.to_a.join(separator) }
      write_commented.call(opt[:footer])
    end
  end
end
|
63
|
+
|
64
|
+
# (dev use) Writes a subset of the peptide centric db to +outfile+, one line
# per distinct amino acid sequence:
#
#     <aaseq>: <protein ids joined by tabs>
#
# sample_to_pephits:: [sample, peptide_hits] pairs; each hit must respond to
#                     #aaseq and #proteins (proteins respond to #id).
# outfile::           path of the file to (over)write.
#
# Hits whose #proteins is nil are skipped: the calling script only assigns
# proteins to hits it found in the peptide centric db, so the rest would
# otherwise raise NoMethodError here. If an aaseq occurs more than once the
# last hit seen determines its protein ids.
def write_subset(sample_to_pephits, outfile="peptidecentric_subset.yml")
  aaseqs_to_prots = {}
  sample_to_pephits.each do |_sample, pephits|
    pephits.each do |pephit|
      proteins = pephit.proteins
      next if proteins.nil?
      aaseqs_to_prots[pephit.aaseq] = proteins.map(&:id)
    end
  end
  File.open(outfile, 'w') do |out|
    aaseqs_to_prots.each do |aaseq, ids|
      out.puts(%Q{#{aaseq}: #{ids.join("\t")}})
    end
  end
end
|
75
|
+
|
76
|
+
|
77
|
+
# Default output locations. +outfile+ is replaced by --outfile after parsing.
outfile = "spectral_counts.tsv"
pephits_outfile = "spectral_counts.pephits.tsv"
delimiter = "\t"

opts = Trollop::Parser.new do
  banner %Q{usage: #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml group1=f1.phq.tsv,f2.phq.tsv group2=f3.phq.tsv,f4.phq.tsv
or (each file a group): #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml file1.phq.tsv file2.phq.tsv ...

writes to #{outfile}
group names can be arbitrarily defined
}
  opt :fdr_percent, "%FDR as cutoff", :default => 1.0
  opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta. Only 2 groups currently allowed", :default => false
  opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
  opt :fasta, "the fasta file. Required for :qspec and :descriptions", :type => String
  opt :outfile, "the file to which data are written", :default => outfile
  opt :peptides, "also write peptide hits (to: #{pephits_outfile})", :default => false
  opt :verbose, "speak up", :default => false
  opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
  opt :qspec_decibans, "report bayesfactor in decibans"
  opt :qspec_normalize, "normalize spectral counts per run", :default => false
  opt :qspec_keep_files, "keep a copy of the files submitted and returned from Qspec", :default => false
  opt :write_subset, "(dev use only) write subset db", :default => false
end

opt = opts.parse(ARGV)
# the count type is later used as a method name, so keep it as a symbol
opt[:count_type] = opt[:count_type].to_sym
# honor --outfile: the rest of the script writes to (and reports) +outfile+
outfile = opt[:outfile]

$VERBOSE = opt.delete(:verbose)

# need the peptide centric db plus at least one sample file
if ARGV.size < 2
  opts.educate
  exit
end

if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
  puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
  opts.educate
  exit
end
|
115
|
+
|
116
|
+
# The first positional argument is the peptide centric db; it must be a .yml file.
peptide_centric_db_file = ARGV.shift
if File.extname(peptide_centric_db_file) != '.yml'
  raise ArgumentError, "need .yml file for peptide centric db"
end
putsv "using: #{peptide_centric_db_file} as peptide centric db"

# The remaining arguments (groupname=file,file,... or bare files) are turned
# into three lookup tables by Mspire::Quant::Cmdline.args_to_hashes.
samplename_to_filename, condition_to_samplenames, samplename_to_condition =
  Mspire::Quant::Cmdline.args_to_hashes(ARGV)

if opt[:qspec] && condition_to_samplenames.size != 2
  raise ArgumentError, "must have 2 conditions for qspec!"
end

samplenames = samplename_to_filename.keys
|
127
|
+
|
128
|
+
# Adds two script-local attributes to peptide hits:
# experiment_name:: set below to the name of the sample the hit came from
# protein_groups::  read below when weighting spectral counts; presumably
#                   filled in by ProteinGroup.peptide_hits_to_protein_groups
#                   (confirm against that method)
class Mspire::Ident::PeptideHit
  attr_accessor :experiment_name, :protein_groups
end
|
132
|
+
|
133
|
+
# Gives proteins a writable +length+, filled in below from the fasta file
# when one is supplied.
class Mspire::Ident::Protein
  attr_accessor :length
end
|
136
|
+
|
137
|
+
|
138
|
+
# --fdr-percent is a percentage; hit q-values are compared against a fraction.
# Dividing by a float keeps this correct even if an integer percentage is given.
fdr_cutoff = opt[:fdr_percent] / 100.0

# Protein lengths and descriptions are only needed for qspec or the
# description column. When neither is requested both hashes stay nil and the
# annotation below skips them.
if opt[:qspec] || opt[:descriptions]
  putsv "reading lengths and descriptions from #{opt[:fasta]}"
  #Mspire::Fasta.protein_lengths_and_descriptions(opt[:fasta])
  id_to_length = {}
  id_to_desc = {}
  Mspire::Fasta.foreach(opt[:fasta]) do |entry|
    acc = entry.accession
    id_to_length[acc] = entry.length
    # the description is everything after the first whitespace-delimited token
    id_to_desc[acc] = entry.definition[/^\S+\s(.*)/,1]
  end
end

# [samplename, peptide hits passing the FDR cutoff] pairs
samplename_to_peptidehits = samplename_to_filename.map do |sample, file|
  [sample, Mspire::Ident::PeptideHit::Qvalue.from_file(file).select {|hit| hit.qvalue <= fdr_cutoff }]
end

# update each peptide hit with protein hits and sample name:
# (one shared Protein object per protein id)
all_protein_hits = Hash.new {|h,id| h[id] = Mspire::Ident::Protein.new(id) }

Mspire::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
  samplename_to_peptidehits.each do |sample, peptide_hits|
    # removes pephits that aren't in the database, (usually ones with aa 'X'
    # in them ). This is done in place so the hits really are dropped from
    # samplename_to_peptidehits; previously the filtered list was a throwaway
    # local and unannotated hits leaked into everything downstream.
    peptide_hits.reject! {|hit| !peptide_to_proteins[hit.aaseq] }
    peptide_hits.each do |hit|
      # update each peptide with its protein hits
      protein_hits = peptide_to_proteins[hit.aaseq].map do |id|
        protein = all_protein_hits[id]
        protein.length = id_to_length[id] if id_to_length
        protein.description = id_to_desc[id] if id_to_desc
        protein
      end
      hit.experiment_name = sample
      hit.proteins = protein_hits
    end
  end
end
|
178
|
+
|
179
|
+
if opt[:write_subset]
  write_subset(samplename_to_peptidehits)
end

if $VERBOSE
  samplename_to_peptidehits.each do |samplename, hits|
    putsv "#{samplename}: #{hits.size}"
  end
end

# every retained peptide hit, across all samples
all_peptide_hits = samplename_to_peptidehits.map {|_samplename, hits| hits }.flatten(1)

# this constricts everything down to a minimal set of protein groups that
# explain the entire set of peptide hits.
update_pephits = true  # ensures that each pephit is linked to the array of protein groups it is associated with
protein_groups = Mspire::Ident::ProteinGroup.peptide_hits_to_protein_groups(all_peptide_hits, update_pephits)

# samplename => one array of peptide hits per protein group (in
# protein_groups order), keeping only the hits that came from that sample
hits_table_hash = samplenames.each_with_object({}) do |name, by_sample|
  by_sample[name] = protein_groups.map do |prot_group|
    prot_group.peptide_hits.select {|hit| hit.experiment_name == name }
  end
end

# Transposing puts one protein group per row (protein group order is
# implicit) and one sample per column:
#
#  (implied) sample1   sample2   sample3  ...
#  (group1)  [hit,hit] [hit...]  [hit...] ...
#  (group2)  [hit,hit] [hit...]  [hit...] ...
#  ...       ...       ...       ...      ...
hits_table = Ruport::Data::Table.new(
  :data => hits_table_hash.values.transpose,
  :column_names => hits_table_hash.keys
)

# spectral counts of type opt[:count_type]; each hit is weighted by
# 1 / (number of protein groups it is linked to)
counts_data = hits_table.data.map do |row|
  row.map do |pephits|
    counts = Mspire::Quant::SpectralCounts.counts(pephits) {|pephit| 1.0 / pephit.protein_groups.size }
    counts.send(opt[:count_type])
  end
end

# each cell holds the opt[:count_type] count for one protein group (row) in
# one sample (column)
counts_table = Ruport::Data::Table.new(:data => counts_data, :column_names => samplenames)
|
215
|
+
|
216
|
+
# Optionally run qspec on the counts and append its results (bayes factor or
# decibans, fold change, fdr) as extra columns of counts_table.
if opt[:qspec]

  # prepare data for qspec: one [condition, column of counts] pair per sample
  condition_to_count_array = counts_table.column_names.map do |name|
    [samplename_to_condition[name], counts_table.column(name)]
  end

  # [colon-joined protein ids, average length of the proteins in the group].
  # The mean is computed in floating point and then rounded; integer
  # division would silently floor it before #round had any effect.
  name_length_pairs = protein_groups.map do |pg|
    mean_length = pg.map(&:length).reduce(:+).to_f / pg.size
    [pg.map(&:id).join(":"), mean_length.round]
  end

  qspec_results = Mspire::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize], :keep => opt[:qspec_keep_files])

  use_decibans = opt[:qspec_decibans]
  cols_to_add = [:bayes_factor, :fold_change, :fdr]
  # with --qspec-decibans the bayes factor column is reported as :decibans
  header_for = lambda do |col|
    (use_decibans && col == :bayes_factor) ? :decibans : col
  end

  counts_table.add_columns cols_to_add.map(&header_for)
  # qspec results are paired with the table rows by position
  counts_table.data.zip(qspec_results) do |row, qspec_result|
    cols_to_add.each do |col|
      value = qspec_result[col]
      # decibans = 10 * log10(bayes factor)
      value = 10 * Math.log10(value) if use_decibans && col == :bayes_factor
      row[header_for.call(col)] = value
    end
  end
end
|
249
|
+
|
250
|
+
# Label every row of the counts table with its protein group. Rows and
# protein groups are paired by position.
counts_table.add_columns([:name, :ids, :description])
counts_table.data.zip(protein_groups).each do |row, group|
  # representative protein: first when ordered by [id, length]
  representative = group.sort_by {|prot| [prot.id, prot.length] }.first
  description = representative.description
  # use the " GN=<token>" value from the description when there is one,
  # otherwise fall back to the protein id
  gene_match = description.andand.match(/ GN=([^\s]+) ?/)
  row.name = gene_match.andand[1] || representative.id
  row.ids = group.map(&:id).join(',')
  row.description = description
end
|
257
|
+
|
258
|
+
|
259
|
+
# With --peptides, also write the peptide hits themselves. Each cell of
# hits_table (an array of hits) is replaced by a string of the form
#   "<charge>_<aaseq>:<hit ids joined by ','>; <charge>_<aaseq>:..."
# with one entry per distinct [aaseq, charge] pair.
if opt[:peptides]
  hits_table.each do |record|
    record.each_with_index do |cell_hits, i|
      by_ion = cell_hits.group_by {|hit| [hit.aaseq, hit.charge] }
      summaries = by_ion.map do |(aaseq, charge), ion_hits|
        [[charge, aaseq].join("_"), ion_hits.map(&:id).join(',')].join(":")
      end
      record[i] = summaries.join('; ')
    end
  end
  # reuse the row names from the counts table as the first column
  hits_table.add_column_with_data(:name, counts_table.column(:name), :position=>0)
  hits_table.to_tsv(pephits_outfile, :footer => ["parallel to #{outfile}"])
end
|
273
|
+
|
274
|
+
# Write the counts table, recording the sample files and the options used
# as commented footer lines.
footer_lines = [
  "samples: #{samplename_to_filename}",
  "options: #{opt}"
]
counts_table.to_tsv(outfile, :footer => footer_lines)
|
@@ -49,6 +49,17 @@ describe 'creating a peptide centric database' do
|
|
49
49
|
#File.unlink(@output_file)
|
50
50
|
end
|
51
51
|
|
52
|
+
# Runs the creator with --trie and checks that the three trie files
# (.trie, .tail, .da) exist next to the test fasta, then reads the trie back.
it 'can use a trie' do
  Mspire::Ident::Peptide::Db::Creator.cmdline([@fasta_file, '--trie'])
  triefile = TESTFILES + '/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4'
  %w(.trie .tail .da).each {|ext| File.exist?("#{triefile}#{ext}").should be_true }
  trie = Trie.read(triefile)
  # the lookup result is only printed for now; the assertion is still disabled
  p trie.get('MADGSGWQPPRPCEAYR')
  #trie.get('MADGSGWQPPRPCEAYR').should == ["D3DX18"]
end
|
62
|
+
|
52
63
|
it 'lists approved enzymes and exits' do
|
53
64
|
output = capture_stdout do
|
54
65
|
begin
|