mspire 0.8.5 → 0.8.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rserve/simpler/R'
4
+ require 'runarray/narray'
5
+
6
# One observed peak picked from a spectrum as the match for the target m/z.
#
#   mz          - observed m/z of the peak
#   intensity   - intensity of the peak
#   spectrum_id - id of the spectrum the peak came from
#   dev         - signed deviation of the observed m/z
MzDiffs = Struct.new(:mz, :intensity, :spectrum_id, :dev) do
  # Unsigned size of the deviation.
  def abs_dev
    dev.abs
  end
end
11
+
12
# Works out the m/z shift to associate with every spectrum.
#
# mz_theor   - the theoretical m/z used as the reference
# mz_diffs   - objects responding to #mz, #spectrum_id and #abs_dev
# dev_cutoff - observations whose abs_dev is not below this value are treated
#              as "far" and simply receive the global shift
#
# Returns a Hash of spectrum_id => shift.
def find_spectral_shifts(mz_theor, mz_diffs, dev_cutoff = 0.5)
  near, distant = mz_diffs.partition { |diff| diff.abs_dev < dev_cutoff }
  near_mzs = near.map(&:mz)

  outlier_indices = Runarray::NArray.new(near_mzs).outliers_iteratively(3)

  # the global shift is estimated from the near values with outliers removed
  inlier_mzs = near_mzs.each_with_index
                       .reject { |_mz, i| outlier_indices.include?(i) }
                       .map(&:first)
  mean, _sd = Runarray::NArray.new(inlier_mzs).sample_stats
  global_shift = mean - mz_theor

  shifts = {}
  near.each_with_index do |diff, i|
    shifts[diff.spectrum_id] =
      if outlier_indices.include?(i)
        # outliers are not trusted individually; fall back to the global shift
        global_shift
      else
        global_shift + (diff.mz - mean)
      end
  end
  distant.each { |diff| shifts[diff.spectrum_id] = global_shift }

  #pvalue = R.converse( mz_diffs: close_mz_vals ) do
  #  "shapiro.test(mz_diffs)$p.value"
  #end
  shifts
end
48
+
49
+ require 'optparse'
50
+ require 'mspire/mzml'
51
# Command-line driver: for every mzML file given, find the peak nearest to the
# requested m/z in each MS1 spectrum, derive a per-spectrum shift with
# find_spectral_shifts, apply it to the MS1 m/z values and write the result to
# <file>.massCorrected.mzML.
ext = ".massCorrected.mzML"
opt = {}
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename($0)} [OPTS] <m/z> <file>.mzML ..."
  op.separator "output: <file>#{ext}"
  op.separator "finds the nearest m/z to <m/z> and shifts m/z values"
  op.separator "prints the corrected deviation to stdout"
  op.separator ""
  op.separator "options:"
  op.on("-t", "--threshold <Float>", Float, 'intensity must be above threshold') {|v| opt[:threshold] = v }
  # NOTE(review): Regexp.escape turns the argument into a literal-substring
  # match even though the option is described as a regex -- confirm intent.
  op.on("-f", "--filter-string-regex <regex-no-slashes>", 'only match and calibrate if matches filter string') {|v| opt[:filter_string_regex] = Regexp.new(Regexp.escape(v)) }
end
opts.parse!

if ARGV.size == 0
  puts opts
  exit
end

# NOTE(review): the threshold is parsed but nothing below applies it.
threshold = opt[:threshold] || 0.0
filter_string_regex = opt[:filter_string_regex]

mz_theor = ARGV.shift.to_f

ARGV.each do |file|
  base = file.chomp(File.extname(file))
  outfile = base + ext

  mz_diffs = []
  Mspire::Mzml.open(file) do |mzml|
    # first pass: collect the deviation observed in each MS1 spectrum
    mzml.each do |spectrum|
      next unless spectrum.ms_level == 1
      if filter_string_regex
        next unless filter_string_regex.match(spectrum.scan_list.first.fetch_by_acc('MS:1000512'))
      end

      indices = spectrum.find_all_nearest_index(mz_theor)
      # max_by ranks candidates by intensity; Enumerable#max would treat a
      # one-argument block as a comparator and not pick the most intense peak
      best_index = indices.max_by {|i| spectrum.intensities[i] }
      # no candidate peak in this spectrum: leave it without a shift entry
      next unless best_index

      closest_mz = spectrum.mzs[best_index]
      mz_diffs << MzDiffs.new(closest_mz, spectrum.intensities[best_index], spectrum.id, closest_mz - mz_theor)
    end

    spectral_shifts = find_spectral_shifts(mz_theor, mz_diffs)

    # second pass: apply the per-spectrum shift to every MS1 m/z value
    # NOTE(review): the shift is derived as (observed - theoretical) and is
    # ADDED here; confirm the sign is what is intended for a correction.
    spectra = mzml.map do |spectrum|
      if spectrum.ms_level == 1
        shift = spectral_shifts[spectrum.id]
        spectrum.mzs.map! {|mz| mz + shift } if shift
      end
      spectrum
    end

    data_processing = Mspire::Mzml::DataProcessing.new("Corrected_Mass")
    mzml.data_processing_list << data_processing
    mzml.run.spectrum_list = Mspire::Mzml::SpectrumList.new(data_processing, spectra)
    mzml.write(outfile)
  end
end
118
+
@@ -0,0 +1,345 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'yaml'
4
+ require 'set'
5
+ require 'optparse'
6
+ require 'mspire/fasta'
7
+ require 'mspire/ident/peptide/db/io'
8
+
9
# A line matching this starts a new set; the capture is the set name.
SET_RE = /Set\s+(.*)/i
# Extension appended to every file entry listed in the sets file.
QVALUE_EXT = ".phq.tsv"

# Parses a sets_compare file: "Set <name>" lines open a set, every other
# non-blank line names a file (without extension) belonging to the current set.
# File entries are resolved relative to the directory holding +file+ and get
# +ext+ appended.
#
# Returns [sets_to_paths_hash, sets_order].
# Raises RuntimeError if a file entry appears before any "Set" line or if a
# referenced file does not exist.
def sets_compare_to_paths(file, ext=QVALUE_EXT)
  dirname = File.dirname(File.expand_path(file))
  sets = {}
  sets_order = []
  current_set = nil

  IO.readlines(file).each do |raw|
    line = raw.chomp
    next unless line =~ /\w/  # skip blank / punctuation-only lines

    if (md = SET_RE.match(line))
      current_set = md[1].dup
      sets[current_set] = []
      sets_order << current_set
    else
      # without this guard the append below fails on nil with NoMethodError
      if current_set.nil?
        raise RuntimeError, "file entry #{line.inspect} appears before any 'Set <name>' line in #{file}"
      end
      full_path = File.join(dirname, line + ext)
      raise RuntimeError, "file #{full_path} does not exist!!" unless File.exist?(full_path)
      sets[current_set] << full_path
    end
  end

  [sets, sets_order]
end
32
+
33
# Greedy selection of a minimal set of proteins explaining all peptides.
#
# proteins_to_aaseqs - hash of protein => array of amino acid sequences
#
# Proteins are ranked by [number of sequences, total sequence length]; when a
# block is given it is yielded (protein, sequences) and its return value is the
# ranking key instead. The ranking is walked from highest to lowest and each
# protein keeps only the sequences no earlier protein has claimed.
#
# A protein that claims nothing new is recorded as indistinguishable from the
# most recent protein that did claim something, but only when both have exactly
# the same (sorted) sequences.
#
# Returns [minimal_protein_to_uniq_peps_hash, indistinguishable_protein_hash];
# the second hash is keyed on the proteins of the first.
def minimal_protein_set(proteins_to_aaseqs)
  use_block = block_given?

  ranked = proteins_to_aaseqs.sort_by do |prot, aaseqs|
    if use_block
      yield(prot, aaseqs)
    else
      [aaseqs.size, aaseqs.inject(0) { |total, seq| total + seq.size }]
    end
  end
  ranked.reverse!

  claimed = Set.new
  prot_to_uniq_peps = {}
  indistinguishable = {}
  prev_sorted_peps = nil
  prev_prot = nil

  ranked.each do |prot, peps|
    sorted_peps = peps.sort
    # Set#add? is nil for an already-claimed sequence, so only new ones survive
    newly_claimed = peps.select { |pep| claimed.add?(pep) }

    if newly_claimed.empty?
      indistinguishable[prev_prot] << prot if sorted_peps == prev_sorted_peps
    else
      prot_to_uniq_peps[prot] = newly_claimed
      indistinguishable[prot] = []
      prev_sorted_peps = sorted_peps
      prev_prot = prot
    end
  end

  [prot_to_uniq_peps, indistinguishable]
end
88
+
89
# Converts command-line cutoff tokens to numbers: 'nil' and '-' mean
# "no cutoff" (nil); anything else goes through #to_f.
def cutoffs_to_floats(ar)
  ar.map { |token| %w[nil -].include?(token) ? nil : token.to_f }
end
98
+
99
# Tallies hit statistics for each protein.
#
# prot_to_peps - hash of protein => peptide sequences
# seq_to_hits  - hash of peptide sequence => hits (each responding to
#                #sequence and #charge)
#
# Returns a hash keyed on protein whose value is the triplet
# [#distinct aaseqs, #distinct aaseq+charge, #distinct hits].
def stats_per_prot(prot_to_peps, seq_to_hits)
  prot_to_peps.each_with_object({}) do |(prot, pep_seqs), stats|
    hits = Set.new
    seqs = Set.new
    seq_charges = Set.new

    pep_seqs.each do |pep_seq|
      seq_to_hits[pep_seq].each do |hit|
        hits << hit
        seqs << hit.sequence
        seq_charges << (hit.sequence + '_' + hit.charge.to_s)
      end
      # stored per peptide, so a protein with no peptides gets no entry
      stats[prot] = [seqs.size, seq_charges.size, hits.size]
    end
  end
end
122
+
123
# Defaults: a single nil cutoff (no q-value threshold) and the output filename.
opt = { :cutoffs => [nil], :outfile => "summary.yml" }

opts = OptionParser.new do |parser|
  parser.banner = "usage: #{File.basename(__FILE__)} pepcentric_db.yml sets_compare.txt"
  parser.separator "output: #{opt[:outfile]}"
  parser.separator ""
  parser.separator "input: "
  parser.separator "   each <file> referenced in sets_compare.txt should have a"
  parser.separator "   <file>.phq.tsv file"
  parser.separator ""
  parser.separator "options:"
  parser.on("-q", "--qvalue <0-1[,...]>", Array, "only take qvalues < given ['-' for no threshold]") do |list|
    opt[:cutoffs] = cutoffs_to_floats(list)
  end
  parser.separator ""
  parser.separator "formats:"
  parser.on("--output-format", "prints the output yaml scheme and exits") do |flag|
    opt[:output_format] = flag
  end
  parser.on("--input-format", "prints sets_compare.txt format and exits") do |flag|
    opt[:input_format] = flag
  end
  parser.on("--pepcentric-db-format", "prints peptide centric db format and exits") do |flag|
    opt[:pepcentric_db_format] = flag
  end
end

# Full isoform resolution (like IsoformResolver) could be implemented later.
# For now a report is generated, accepting that some isoforms may go
# unreported. It relies on a pre-made map from sequence to protein groups; a
# set of sequences then lets all relationships be deduced from those groups.

opts.parse!
153
+
154
# Delimiters of the peptide-centric DB text format, shown in the help below.
pd = Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER
kvd = Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER

# --pepcentric-db-format: describe one line of the peptide-centric DB
if opt[:pepcentric_db_format]
  puts "pepcentric_db.yml needs to be in the format:"
  puts "<PEPTIDE>#{kvd.inspect}<ID>#{pd.inspect}<ID>#{pd.inspect}<ID>"
  puts "(The delimiters are shown with #inspect)"
end

# --output-format: skeleton of the YAML summary this script writes
if opt[:output_format]
  yaml = <<SKEL
results:
- qvalue_cutoff: <Float>
  sets:
    <set_name>:
      num_uniq_aaseqs: <Integer>
      num_aaseqs_not_in_pep_db: <Integer>
      num_uniq_aaseqs_charge: <Integer>
      proteins:
        <protein_id>:
          num_hits_all:
          - <Integer> # total num aaseqs
          - <Integer> # total num aaseq+charge
          - <Integer> # total num hits
          num_hits_minimal:
          - <Integer> # total num aaseqs
          - <Integer> # total num aaseq+charge
          - <Integer> # total num hits
          indistinguishable:
          - <protein_id>
          - <protein_id>
          aaseqs:
          - <String>
          - <String>
sets_order:
- <String>
- <String>
SKEL
  print yaml
end

# --input-format: describe the sets_compare.txt layout
if opt[:input_format]
  string =<<EXPLANATION
# the sets_compare.txt format is very simple:

Set <some_name_for_set1>
filename1_no_ext
filename2_no_ext
Set <some_name_for_set2>
filename3_no_ext
filename4_no_ext
...
EXPLANATION
  puts string
end

# any of the *_format flags only prints help and stops
exit if opt.keys.any? {|key| key.to_s =~ /_format/ }

# exactly two positional arguments are required; otherwise show usage
if ARGV.size != 2
  puts opts.to_s
  exit
end
218
+
219
# Main processing: for every q-value cutoff and every set, gather the passing
# peptide hits, map them to proteins through the peptide-centric DB, reduce to
# a minimal protein set and collect per-protein statistics. The combined
# results are written as YAML to opt[:outfile].
(pepcentric_fn, sets_compare_fn) = ARGV

results = {}
results['results'] = []

(sets_hash, sets_order) = sets_compare_to_paths(sets_compare_fn)
results['sets_order'] = sets_order

STDERR.print "Loading peptide centric DB (this takes about a minute)..."
start = Time.now
Mspire::Ident::Peptide::Db::IO.open(pepcentric_fn) do |pep_to_prots|
  STDERR.puts "#{Time.now - start} seconds."

  opt[:cutoffs].each do |cutoff|
    cutoff_results = {'qvalue_cutoff' => cutoff}
    results_sets_hash = {}
    cutoff_results['sets'] = results_sets_hash
    results['results'] << cutoff_results

    #########################
    # FOR EACH SET:
    #########################
    # Struct class built once from the first file's headers; this assumes
    # every data file has the same columns in the same order
    pep_klass = nil
    sets_hash.each do |set, files|
      set_results = {}
      results_sets_hash[set] = set_results

      # the complete list of passing hits across this set's files
      all_passing_hits = files.flat_map do |file|
        hash = YAML.load_file(file)
        pep_klass ||= Struct.new(*(hash['headers'].map {|v| v.to_sym }))
        hits = hash['data'].map {|v| pep_klass.new(*v) }
        # NOTE(review): the -q help text says "qvalues < given" but the
        # comparison is <= ; confirm which is intended.
        cutoff ? hits.select {|hit| hit.qvalue <= cutoff } : hits
      end

      # index from aaseq to hits
      seq_to_hits = Hash.new {|h,k| h[k] = []}
      uniq_seqcharge = Set.new
      all_passing_hits.each do |hit|
        seq_to_hits[hit.sequence] << hit
        uniq_seqcharge.add( hit.sequence + '_' + hit.charge.to_s )
      end

      set_results.merge!( { 'num_peptide_hits' => all_passing_hits.size,
                            'num_uniq_aaseqs' => seq_to_hits.size,
                            'num_uniq_aaseqs_charge' => uniq_seqcharge.size,
      })

      # index from proteins to peptides, via the peptide-centric DB handle
      # yielded above (pep_to_prots)
      prots_to_peps = Hash.new {|h,k| h[k] = [] }
      peptides_not_found = []
      seq_to_hits.keys.each do |seq|
        if pep_to_prots.key?(seq)
          pep_to_prots[seq].each {|prot| prots_to_peps[prot] << seq }
        else
          peptides_not_found << seq
        end
      end

      # per-protein stats BEFORE minimization
      stats_per_protein_before = stats_per_prot(prots_to_peps, seq_to_hits)

      # Minimal protein set. No protein-length table is loaded by this script,
      # so ranking uses minimal_protein_set's default key
      # [number of peptides, total peptide length].
      (prot_to_uniq_peps_hash, indistinguishable_protein_hash) = minimal_protein_set(prots_to_peps)

      # per-protein stats AFTER minimization
      stats_per_protein_minimal = stats_per_prot(prot_to_uniq_peps_hash, seq_to_hits)

      # one data hash per protein in the minimal set
      protein_data_hashes_hash = {}
      prot_to_uniq_peps_hash.each do |prot, peps|
        protein_data_hashes_hash[prot] = {
          'aaseqs' => peps,
          # this will be a triplet
          'num_hits_minimal' => stats_per_protein_minimal[prot],
          'indistinguishable' => indistinguishable_protein_hash[prot],
          'num_hits_all' => stats_per_protein_before[prot],
        }
      end

      set_results['proteins'] = protein_data_hashes_hash
      set_results['num_proteins'] = prot_to_uniq_peps_hash.size
      set_results['num_aaseqs_not_in_pep_db'] = peptides_not_found.size
      if peptides_not_found.size > 0
        warn "Did not find in peptide centric db: #{peptides_not_found.join(', ')}"
      end
    end
  end

  File.open(opt[:outfile], 'w') do |out|
    out.print results.to_yaml
  end

end
345
+
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'mspire/mzml'
4
+ require 'optparse'
5
+
6
# Command-line driver: prints every MS2+ spectrum of the given mzML files to
# stdout in MGF format.
opt = {
  filter_zero_intensity: true,
  retention_times: true,
}
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename($0)} <file>.mzML ..."
  op.separator "outputs: <file>.mgf"
  #op.on("--no-filter-zeros", "won't remove values with zero intensity") {|v| opt[:filter_zero_intensity] = false }
  # the default is set in ms/msrun/search.rb -> set_opts
  op.on("--no-retention-times", "won't include RT even if available") {|v| opt[:retention_times] = false }
end

opts.parse!

if ARGV.size == 0
  puts opts
  exit
end

ARGV.each do |file|
  unless File.exist?(file)
    # diagnostics go to stderr so they cannot end up inside the MGF stream
    warn "missing file: #{file} [skipping]"
    next
  end

  Mspire::Mzml.foreach(file).with_index do |spectrum,i|
    next unless spectrum.ms_level > 1

    # presumably nil when the spectrum carries no retention time -- the
    # rt-dependent pieces below are simply left out in that case
    rt = spectrum.retention_time
    charge = spectrum.precursor_charge

    title_parts = [i, "id_#{spectrum.id}"]
    title_parts << "rt_#{rt.round}" if rt

    puts "BEGIN IONS"
    puts "TITLE=#{title_parts.join('.')}"
    puts "RTINSECONDS=#{rt}" if rt && opt[:retention_times]
    puts "PEPMASS=#{spectrum.precursor_mz}"
    # skip the line rather than emit a bare "CHARGE=+" for an unknown charge
    puts "CHARGE=#{charge}+" if charge
    spectrum.each do |mz,int|
      puts [mz, int].join(" ")
    end
    puts "END IONS"
    puts ""
  end
end