mspire 0.8.5 → 0.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/hash/inverse.rb +15 -0
- data/lib/mspire/error_rate/qvalue.rb +5 -5
- data/lib/mspire/fasta.rb +2 -0
- data/lib/mspire/ident/peptide/db/creator.rb +48 -58
- data/lib/mspire/ident/peptide/db/io.rb +5 -0
- data/lib/mspire/ident/peptide_hit/qvalue.rb +2 -2
- data/lib/mspire/ident/peptide_hit.rb +2 -2
- data/lib/mspire/ident/protein_group.rb +4 -2
- data/lib/mspire/isotope/aa.rb +10 -10
- data/lib/mspire/mzml/instrument_configuration.rb +10 -3
- data/lib/mspire/quant/cmdline.rb +42 -0
- data/lib/mspire/quant/protein_group_comparison.rb +29 -0
- data/lib/mspire/quant/spectral_counts.rb +42 -0
- data/script/fasta_to_peptide_centric_db.rb +5 -0
- data/script/mascot_dat_to_peptide_hit_qvalues.rb +37 -45
- data/script/mass_correct.rb +118 -0
- data/script/minimal_protein_set.rb +345 -0
- data/script/mzml_to_mgf.rb +46 -0
- data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +275 -0
- data/spec/mspire/ident/peptide/db/creator_spec.rb +11 -0
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +157 -157
- metadata +11 -2
@@ -0,0 +1,275 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'andand'
|
4
|
+
require 'set'
|
5
|
+
require 'ruport'
|
6
|
+
|
7
|
+
require 'mspire/ident/peptide_hit/qvalue'
|
8
|
+
require 'mspire/ident/peptide_hit'
|
9
|
+
require 'mspire/ident/protein_group'
|
10
|
+
require 'mspire/ident/protein'
|
11
|
+
require 'mspire/ident/peptide/db/io'
|
12
|
+
require 'mspire/quant/spectral_counts'
|
13
|
+
require 'mspire/quant/protein_group_comparison'
|
14
|
+
require 'mspire/quant/qspec/protein_group_comparison'
|
15
|
+
require 'mspire/quant/qspec'
|
16
|
+
require 'mspire/quant/cmdline'
|
17
|
+
require 'mspire/fasta'
|
18
|
+
|
19
|
+
|
20
|
+
require 'yaml'
|
21
|
+
require 'tempfile'
|
22
|
+
|
23
|
+
require 'trollop'
|
24
|
+
|
25
|
+
# Verbose-only puts: prints +args+ and flushes $stdout right away, but only
# when $VERBOSE is truthy.  Does nothing (and returns nil) otherwise.
def putsv(*args)
  return unless $VERBOSE

  puts(*args)
  $stdout.flush
end
|
30
|
+
|
31
|
+
# Strips the final extension from +file+ and, when what remains ends in
# '.phq', strips that as well ("f1.phq.tsv" => "f1").  Any leading directory
# part is kept.
def basename(file)
  stem = file.chomp(File.extname(file))
  File.extname(stem) == '.phq' ? stem.chomp('.phq') : stem
end
|
36
|
+
|
37
|
+
class Ruport::Data::Table
  # Adds a column called +colname+ (+opts+ are handed to #add_column) and
  # fills it, row by row, with the values of +array_of_data+.
  #
  # returns self
  def add_column_with_data(colname, array_of_data, opts={})
    add_column(colname, opts)
    data.zip(array_of_data).each do |record, value|
      record[colname] = value
    end
    self
  end

  # Writes the table to +file+ as tab-delimited text: the column names on one
  # line, then one line per row.
  #
  # acceptable opts:
  #
  #     :header => an array of lines (each written commented out with "# "
  #                before the column names)
  #     :footer => an array of lines (each written commented out with "# "
  #                after the last row)
  def to_tsv(file, opt={})
    tab = "\t"
    File.open(file, 'w') do |io|
      if opt[:header]
        opt[:header].each {|text| io.puts "# #{text}" }
      end
      io.puts column_names.join(tab)
      data.each {|record| io.puts record.to_a.join(tab) }
      if opt[:footer]
        opt[:footer].each {|text| io.puts "# #{text}" }
      end
    end
  end

end
|
63
|
+
|
64
|
+
# Writes one "<aaseq>: <tab-separated protein ids>" line to +outfile+ for each
# distinct aaseq found among the peptide hits.
#
# sample_to_pephits:: enumerable of [sample, peptide_hits] pairs; only the
#                     hits (last member of each pair) are used.  When an aaseq
#                     occurs more than once, the proteins of the last hit seen
#                     are the ones written.
def write_subset(sample_to_pephits, outfile="peptidecentric_subset.yml")
  ids_by_aaseq = sample_to_pephits.flat_map(&:last).each_with_object({}) do |hit, memo|
    memo[hit.aaseq] = hit.proteins.map(&:id)
  end
  File.open(outfile, 'w') do |io|
    ids_by_aaseq.each_pair do |aaseq, ids|
      io.puts "#{aaseq}: #{ids.join("\t")}"
    end
  end
end
|
75
|
+
|
76
|
+
|
77
|
+
# default output locations (shown in the help text below)
outfile = "spectral_counts.tsv"
pephits_outfile = "spectral_counts.pephits.tsv"

# Command line definition.  Nothing is parsed here; this only builds the
# parser (and its usage banner) that is used further down.
opts = Trollop::Parser.new do
  banner %Q{usage: #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml group1=f1.phq.tsv,f2.phq.tsv group2=f3.phq.tsv,f4.phq.tsv
or (each file a group): #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml file1.phq.tsv file2.phq.tsv ...

writes to #{outfile}
group names can be arbitrarily defined
}
  opt :fdr_percent, "%FDR as cutoff", :default => 1.0
  opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta. Only 2 groups currently allowed", :default => false
  opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
  opt :fasta, "the fasta file. Required for :qspec and :descriptions", :type => String
  opt :outfile, "the file to which data are written", :default => outfile
  opt :peptides, "also write peptide hits (to: #{pephits_outfile})", :default => false
  opt :verbose, "speak up", :default => false
  opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
  opt :qspec_decibans, "report bayesfactor in decibans"
  opt :qspec_normalize, "normalize spectral counts per run", :default => false
  opt :qspec_keep_files, "keep a copy of the files submitted and returned from Qspec", :default => false
  opt :write_subset, "(dev use only) write subset db", :default => false
end
|
101
|
+
|
102
|
+
# Parse inside Trollop's standard handler so that --help and malformed options
# print a message and exit instead of surfacing as an uncaught exception.
opt = Trollop.with_standard_exception_handling(opts) { opts.parse(ARGV) }

# honor --outfile (previously parsed but never applied)
outfile = opt[:outfile]

# The count type is later used as a method name, so only accept the
# documented values.
valid_count_types = %w(spectral aaseqcharge aaseq)
unless valid_count_types.include?(opt[:count_type])
  raise ArgumentError, "count_type must be one of: #{valid_count_types.join(', ')}"
end
opt[:count_type] = opt[:count_type].to_sym

$VERBOSE = opt.delete(:verbose)

# need the peptide centric db plus at least one sample/group argument
if ARGV.size < 2
  opts.educate
  exit
end

if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
  puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
  opts.educate
  exit
end

peptide_centric_db_file = ARGV.shift
raise ArgumentError, "need .yml file for peptide centric db" unless File.extname(peptide_centric_db_file) == '.yml'
putsv "using: #{peptide_centric_db_file} as peptide centric db"

# groupname => files

(samplename_to_filename, condition_to_samplenames, samplename_to_condition) = Mspire::Quant::Cmdline.args_to_hashes(ARGV)

raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2

samplenames = samplename_to_filename.keys
|
127
|
+
|
128
|
+
# Reopened to give each peptide hit two extra attributes used by this script:
# the name of the sample it came from and its associated protein groups.
class Mspire::Ident::PeptideHit
  attr_accessor :experiment_name, :protein_groups
end
|
132
|
+
|
133
|
+
# Reopened so a protein can carry its sequence length (filled in from the
# fasta file when one is given).
class Mspire::Ident::Protein
  attr_accessor(:length)
end
|
136
|
+
|
137
|
+
|
138
|
+
# --fdr-percent is a percentage; it is compared against hit.qvalue below
fdr_cutoff = opt[:fdr_percent] / 100.0

# Both lookups stay nil unless a fasta file is actually needed; the annotation
# loop below checks them before use.
id_to_length = nil
id_to_desc = nil
if opt[:qspec] || opt[:descriptions]
  putsv "reading lengths and descriptions from #{opt[:fasta]}"
  #Mspire::Fasta.protein_lengths_and_descriptions(opt[:fasta])
  id_to_length = {}
  id_to_desc = {}
  Mspire::Fasta.foreach(opt[:fasta]) do |entry|
    acc = entry.accession
    id_to_length[acc] = entry.length
    # everything after the first whitespace-delimited token of the definition
    id_to_desc[acc] = entry.definition[/^\S+\s(.*)/,1]
  end
end

# array of [sample, peptide hits passing the cutoff] pairs
samplename_to_peptidehits = samplename_to_filename.map do |sample, file|
  [sample, Mspire::Ident::PeptideHit::Qvalue.from_file(file).select {|hit| hit.qvalue <= fdr_cutoff }]
end

# update each peptide hit with protein hits and sample name:
# (one shared Protein object per id, created on first lookup)
all_protein_hits = Hash.new {|h,id| h[id] = Mspire::Ident::Protein.new(id) }

Mspire::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
  # map! (with the pair returned at the end of the block) so that the hits
  # dropped here really are gone from samplename_to_peptidehits; previously
  # the filtered list was computed and then thrown away.
  samplename_to_peptidehits.map! do |sample, peptide_hits|
    # removes pephits that aren't in the database, (usually ones with aa 'X'
    # in them )
    normal_pephits = peptide_hits.select {|hit| peptide_to_proteins[hit.aaseq] }
    normal_pephits.each do |hit|
      # update each peptide with its protein hits
      protein_hits = peptide_to_proteins[hit.aaseq].map do |id|
        protein = all_protein_hits[id]
        protein.length = id_to_length[id] if id_to_length
        protein.description = id_to_desc[id] if id_to_desc
        protein
      end
      hit.experiment_name = sample
      hit.proteins = protein_hits
    end
    [sample, normal_pephits]
  end
end
|
178
|
+
|
179
|
+
write_subset(samplename_to_peptidehits) if opt[:write_subset]

# putsv is silent unless $VERBOSE, so no extra guard is needed
samplename_to_peptidehits.each {|samplename, hits| putsv "#{samplename}: #{hits.size}" }

all_peptide_hits = samplename_to_peptidehits.flat_map(&:last)

# this constricts everything down to a minimal set of protein groups that
# explain the entire set of peptide hits.
update_pephits = true  # ensures that each pephit is linked to the array of protein groups it is associated with
protein_groups = Mspire::Ident::ProteinGroup.peptide_hits_to_protein_groups(all_peptide_hits, update_pephits)

# sample name => one array of peptide hits per protein group (same order as
# protein_groups)
hits_table_hash = {}
samplenames.each do |name|
  hits_table_hash[name] = protein_groups.map do |prot_group|
    prot_group.peptide_hits.select {|hit| hit.experiment_name == name }
  end
end

# One column per sample and one row per protein group (row order follows
# protein_groups).  Each cell holds that group's peptide hits from that sample.
#
#  (implied)  sample1    sample2    sample3   ...
#  (group1)   [hit,hit]  [hit...]   [hit...]  ...
#  (group2)   [hit,hit]  [hit...]   [hit...]  ...
#  ...        ...        ...        ...       ...
hits_table = Ruport::Data::Table.new(:data => hits_table_hash.values.transpose, :column_names => hits_table_hash.keys)

# spectral counts of type opt[:count_type]; each peptide hit is weighted by
# 1 / (number of protein groups it belongs to)
counts_data = hits_table.data.map do |row|
  row.map do |pephits|
    counts = Mspire::Quant::SpectralCounts.counts(pephits) {|pephit| 1.0 / pephit.protein_groups.size }
    # public_send: the method name comes from the command line, so never
    # reach private methods with it
    counts.public_send(opt[:count_type])
  end
end

# each cell holds the count of the selected type for one protein group in one
# sample
counts_table = Ruport::Data::Table.new(:data => counts_data, :column_names => samplenames)
|
215
|
+
|
216
|
+
# optionally run qspec and append its results as extra columns
if opt[:qspec]

  # prepare data for qspec: one [condition, counts column] pair per sample
  condition_to_count_array = counts_table.column_names.map do |name|
    [samplename_to_condition[name], counts_table.column(name)]
  end
  # average length of the proteins in the group, rounded to the nearest
  # integer (float division first; integer division would truncate and make
  # the round a no-op)
  name_length_pairs = protein_groups.map do |pg|
    total_length = pg.map(&:length).reduce(:+)
    [pg.map(&:id).join(":"), (total_length.to_f / pg.size).round]
  end

  qspec_results = Mspire::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize], :keep => opt[:qspec_keep_files])

  cols_to_add = [:bayes_factor, :fold_change, :fdr]
  # with --qspec-decibans the bayes factor column is reported as :decibans
  to_add_as_headers = cols_to_add.map do |col|
    (opt[:qspec_decibans] && col == :bayes_factor) ? :decibans : col
  end
  counts_table.add_columns to_add_as_headers
  # rows and qspec results are paired up by position
  counts_table.data.zip(qspec_results) do |row, qspec_result|
    cols_to_add.each do |cat|
      if cat == :bayes_factor && opt[:qspec_decibans]
        row[:decibans] = 10 * Math.log10(qspec_result[cat])
      else
        row[cat] = qspec_result[cat]
      end
    end
  end
end
|
249
|
+
|
250
|
+
# Label every row with a short name, the ids of all proteins in its group and
# a description.  Rows and protein groups are paired up by position.
counts_table.add_columns([:name, :ids, :description])
counts_table.data.zip(protein_groups).each do |record, group|
  # representative protein: first when ordered by [id, length]
  representative = group.sort_by {|protein| [protein.id, protein.length] }.first
  description = representative.description
  # prefer the "GN=" token of the description, falling back to the id
  gene_match = description.andand.match(/ GN=([^\s]+) ?/)
  record.name = gene_match.andand[1] || representative.id
  record.ids = group.map(&:id).join(',')
  record.description = description
end
|
257
|
+
|
258
|
+
|
259
|
+
# With --peptides, also write the hits table.  Every cell (an array of peptide
# hits) is first replaced by a string of "<charge>_<aaseq>:<id>,<id>..."
# entries joined with '; '.
if opt[:peptides]
  hits_table.each do |record|
    record.each_with_index do |cell_hits, idx|
      by_seq_and_charge = cell_hits.group_by {|hit| [hit.aaseq, hit.charge] }
      entries = by_seq_and_charge.map do |(aaseq, charge), grouped_hits|
        label = [charge, aaseq].join("_")
        hit_ids = grouped_hits.map(&:id).join(',')
        [label, hit_ids].join(":")
      end
      record[idx] = entries.join('; ')
    end
  end
  # put the same row names used in the counts table in the first column
  hits_table.add_column_with_data(:name, counts_table.column(:name), :position=>0)
  hits_table.to_tsv(pephits_outfile, :footer => ["parallel to #{outfile}"])
end
|
273
|
+
|
274
|
+
# Write the counts table, recording the samples and options used as commented
# footer lines.
intro = [
  "samples: #{samplename_to_filename}",
  "options: #{opt}"
]
counts_table.to_tsv(outfile, :footer => intro)
|
@@ -49,6 +49,17 @@ describe 'creating a peptide centric database' do
|
|
49
49
|
#File.unlink(@output_file)
|
50
50
|
end
|
51
51
|
|
52
|
+
# Runs the creator with --trie and checks that the three trie files exist.
it 'can use a trie' do
  Mspire::Ident::Peptide::Db::Creator.cmdline([@fasta_file, '--trie'])
  triefile = TESTFILES + '/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4'
  ['.trie', '.tail', '.da'].each do |suffix|
    File.exist?(triefile + suffix).should be_true
  end
  trie = Trie.read(triefile)
  # the lookup result is only printed; the assertion below is still disabled
  p trie.get('MADGSGWQPPRPCEAYR')
  #trie.get('MADGSGWQPPRPCEAYR').should == ["D3DX18"]
end
|
62
|
+
|
52
63
|
it 'lists approved enzymes and exits' do
|
53
64
|
output = capture_stdout do
|
54
65
|
begin
|