mspire 0.8.5 → 0.8.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rserve/simpler/R'
4
+ require 'runarray/narray'
5
+
# One calibrant observation taken from a single spectrum: the observed m/z,
# its intensity, the id of the spectrum it came from and the signed
# deviation of the observed m/z (stored in +dev+).
MzDiffs = Struct.new(:mz, :intensity, :spectrum_id, :dev) do
  # Size of the deviation regardless of its direction.
  def abs_dev
    dev.abs
  end
end
11
+
# Estimates, per spectrum, how far the observed calibrant m/z sits from its
# theoretical value.
#
# mz_theor   - theoretical m/z of the calibrant
# mz_diffs   - objects responding to #mz, #spectrum_id and #abs_dev
#              (e.g. MzDiffs), one per spectrum
# dev_cutoff - observations whose absolute deviation is not below this value
#              are considered "far" and only receive the global shift
#
# Returns a Hash of spectrum_id => shift, where every shift is expressed as
# (observed - theoretical).  Observations that are far from the theoretical
# value, or that are flagged as outliers among the close ones, get the global
# shift (mean of the non-outlier close m/z values minus mz_theor); all other
# observations get their own deviation.
#
# When no observation lies within dev_cutoff there is nothing to estimate a
# shift from, so an empty Hash is returned instead of computing statistics
# over an empty array.
def find_spectral_shifts(mz_theor, mz_diffs, dev_cutoff = 0.5)
  spec_id_to_shift = {}

  close_diffs, far_diffs = mz_diffs.partition {|diff| diff.abs_dev < dev_cutoff }
  return spec_id_to_shift if close_diffs.empty?

  close_mz_vals = close_diffs.map(&:mz)
  outlier_indices = Runarray::NArray.new(close_mz_vals).outliers_iteratively(3)

  # the global shift is estimated from the close values with outliers removed
  tight_mz_vals = close_mz_vals.reject.with_index {|_mz, i| outlier_indices.include?(i) }
  mean, _sd = Runarray::NArray.new(tight_mz_vals).sample_stats
  global_shift = mean - mz_theor

  close_diffs.each_with_index do |mz_diff, i|
    spec_id_to_shift[mz_diff.spectrum_id] =
      if outlier_indices.include?(i)
        global_shift
      else
        global_shift + (mz_diff.mz - mean)
      end
  end

  far_diffs.each {|mz_diff| spec_id_to_shift[mz_diff.spectrum_id] = global_shift }

  #pvalue = R.converse( mz_diffs: close_mz_vals ) do
  #  "shapiro.test(mz_diffs)$p.value"
  #end
  spec_id_to_shift
end
48
+
49
+ require 'optparse'
50
+ require 'mspire/mzml'
# Command line driver: for every mzML file given, find the peak nearest the
# theoretical m/z in each MS1 spectrum, derive a per-spectrum shift with
# find_spectral_shifts and write a corrected copy to <file>.massCorrected.mzML.
ext = ".massCorrected.mzML"
opt = {}
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename($0)} [OPTS] <m/z> <file>.mzML ..."
  op.separator "output: <file>#{ext}"
  op.separator "finds the nearest m/z to <m/z> and shifts m/z values"
  op.separator "prints the corrected deviation to stdout"
  op.separator ""
  op.separator "options:"
  op.on("-t", "--threshold <Float>", Float, 'intensity must be above threshold') {|v| opt[:threshold] = v }
  # NOTE(review): the argument is escaped, so it is matched literally rather
  # than as a regular expression -- confirm this is the intended behavior.
  op.on("-f", "--filter-string-regex <regex-no-slashes>", 'only match and calibrate if matches filter string') {|v| opt[:filter_string_regex] = Regexp.new(Regexp.escape(v)) }
end
opts.parse!

# both the m/z and at least one file are required
if ARGV.size < 2
  puts opts
  exit
end

# nil means "no intensity filtering" (the behavior when -t is not given)
threshold = opt[:threshold]
filter_string_regex = opt[:filter_string_regex]

mz_arg = ARGV.shift
begin
  # Float() rejects garbage that String#to_f would silently turn into 0.0
  mz_theor = Float(mz_arg)
rescue ArgumentError
  abort "invalid <m/z>: #{mz_arg.inspect}\n#{opts}"
end

ARGV.each do |file|
  base = file.chomp(File.extname(file))
  outfile = base + ext

  mz_diffs = []
  Mspire::Mzml.open(file) do |mzml|
    # Finding the deviation
    mzml.each do |spectrum|
      next unless spectrum.ms_level == 1
      if filter_string_regex
        next unless filter_string_regex.match(spectrum.scan_list.first.fetch_by_acc('MS:1000512'))
      end
      indices = spectrum.find_all_nearest_index(mz_theor)
      # of the equally-near candidates keep the most intense one
      # (max with a one-argument block would treat it as a comparator)
      best_index = indices.max_by {|i| spectrum.intensities[i] }
      next if best_index.nil?
      intensity = spectrum.intensities[best_index]
      next if threshold && intensity <= threshold
      closest_mz = spectrum.mzs[best_index]
      mz_diffs << MzDiffs.new(closest_mz, intensity, spectrum.id, closest_mz - mz_theor)
    end

    spectral_shifts = find_spectral_shifts(mz_theor, mz_diffs)

    # correcting the masses
    spectra = mzml.map do |spectrum|
      shift = spectral_shifts[spectrum.id]
      if spectrum.ms_level == 1 && shift
        # shift is (observed - theoretical), so it has to be removed from the
        # observed values; adding it would double the error
        spectrum.mzs.map! {|mz| mz - shift }
      end
      spectrum
    end

    data_processing = Mspire::Mzml::DataProcessing.new("Corrected_Mass")
    mzml.data_processing_list << data_processing
    mzml.run.spectrum_list = Mspire::Mzml::SpectrumList.new(data_processing, spectra)
    mzml.write(outfile)
  end
end
118
+
@@ -0,0 +1,345 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'yaml'
4
+ require 'set'
5
+ require 'optparse'
6
+ require 'mspire/fasta'
7
+ require 'mspire/ident/peptide/db/io'
8
+
9
+ SET_RE = /Set\s+(.*)/i
10
+ QVALUE_EXT = ".phq.tsv"
11
+
12
+ # returns [sets_to_paths_hash, sets_order]
13
+ def sets_compare_to_paths(file, ext=QVALUE_EXT)
14
+ dirname = File.dirname(File.expand_path(file))
15
+ lines = IO.readlines(file).map {|v| v.chomp }.select {|v| v =~ /\w/}
16
+ sets = {}
17
+ current_set = nil
18
+ sets_order = []
19
+ lines.each do |line|
20
+ if line =~ SET_RE
21
+ current_set = $1.dup
22
+ sets[current_set] = []
23
+ sets_order << current_set
24
+ else
25
+ full_path = (File.join(dirname,(line + ext)))
26
+ raise RuntimeError, "file #{full_path} does not exist!!" unless File.exist?(full_path)
27
+ sets[current_set] << full_path
28
+ end
29
+ end
30
+ [sets, sets_order]
31
+ end
32
+
33
+ # returns [minimal_protein_to_uniq_peps_hash, indistinguishable_protein_hash]
34
+ # takes a hash of proteins to aaseqs. Uses a greedy algorithm where
35
+ # things are sorted first by the number of uniq amino acid sequences and total
36
+ # aa length. if a block is given, then will yield the prot and the
37
+ # peptide_array and sort by the returned value. The greedy algorithm acts on
38
+ # the REVERSE of the sorted proteins. indistinguishable_protein_hash is keyed
39
+ # on the proteins in the minimal_protein_array and gives an array of other
40
+ # proteins.
41
+ def minimal_protein_set(proteins_to_aaseqs)
42
+ blk_given = block_given?
43
+ #STDERR.puts "using block for minimal_protein_set" if blk_given
44
+ proteins_and_uniq_peps = []
45
+
46
+ sorted_most_to_least = proteins_to_aaseqs.sort_by do |k,v|
47
+ if blk_given
48
+ yield(k,v)
49
+ else
50
+ [ v.size, v.inject(0){|m,s| m+s.size} ]
51
+ end
52
+ end.reverse
53
+
54
+ found_seq = Set.new
55
+
56
+ same_peptide_hits = {}
57
+
58
+ last_peps = nil
59
+ last_uniq_prot = nil
60
+ sorted_most_to_least.each do |prot, peps|
61
+ sorted_peps = peps.sort # is it necessary to SORT?????????
62
+ uniq_peps = peps.select do |pep|
63
+ if found_seq.include?(pep)
64
+ false
65
+ else
66
+ found_seq.add pep
67
+ true
68
+ end
69
+ end
70
+ if uniq_peps.size > 0
71
+ proteins_and_uniq_peps << [prot, uniq_peps]
72
+ same_peptide_hits[prot] = []
73
+ last_peps = sorted_peps
74
+ last_uniq_prot = prot
75
+ else
76
+ if sorted_peps == last_peps
77
+ same_peptide_hits[last_uniq_prot] << prot
78
+ end
79
+ end
80
+ end
81
+ prot_to_uniq_peps_hash = {}
82
+ proteins_and_uniq_peps.each do |prot, uniq_peps|
83
+ prot_to_uniq_peps_hash[prot] = uniq_peps
84
+ end
85
+
86
+ [prot_to_uniq_peps_hash, same_peptide_hits]
87
+ end
88
+
# Converts the cutoff strings given on the command line: 'nil' and '-' mean
# "no cutoff" and become nil, anything else is converted with #to_f.
def cutoffs_to_floats(ar)
  ar.map {|token| %w[nil -].include?(token) ? nil : token.to_f }
end
98
+
# Counts the evidence supporting each protein.
#
# prot_to_peps - hash of protein id => array of peptide sequences
# seq_to_hits  - hash of peptide sequence => array of hits; each hit must
#                respond to #sequence and #charge
#
# Returns a hash keyed on protein id that yields an array:
#   [#aaseq, #aaseq_and_charge, #total_hits]
# Every protein in prot_to_peps gets an entry; one without peptides yields
# [0, 0, 0] (the triplet used to be assigned inside the peptide loop, which
# left such proteins without any entry).
def stats_per_prot(prot_to_peps, seq_to_hits)
  per_protein_hash = {}
  prot_to_peps.each do |prot, uniq_pep_seqs|
    all = Set.new
    aaseqcharges = Set.new
    aaseqs = Set.new

    uniq_pep_seqs.each do |pep_seq|
      all_hits = seq_to_hits[pep_seq]
      all.merge( all_hits )
      all_hits.each do |hit|
        aaseq = hit.sequence
        aaseqs.add( aaseq )
        aaseqcharges.add( aaseq + '_' + hit.charge.to_s )
      end
    end

    per_protein_hash[prot] = [aaseqs.size, aaseqcharges.size, all.size]
  end
  per_protein_hash
end
122
+
# Command line handling: option defaults, the option parser and the three
# "--*-format" help switches, each of which prints a description and exits.
opt = {
  :cutoffs => [nil],
  :outfile => "summary.yml",
}

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} pepcentric_db.yml sets_compare.txt"
  op.separator "output: #{opt[:outfile]}"
  op.separator ""
  op.separator "input: "
  op.separator "   each <file> referenced in sets_compare.txt should have a"
  op.separator "   <file>.phq.tsv file"
  op.separator ""
  op.separator "options:"
  op.on("-q", "--qvalue <0-1[,...]>", Array, "only take qvalues < given ['-' for no threshold]") {|v| opt[:cutoffs] = cutoffs_to_floats(v)}
  op.separator ""
  op.separator "formats:"
  op.on("--output-format", "prints the output yaml scheme and exits") {|v| opt[:output_format] = v }
  op.on("--input-format", "prints sets_compare.txt format and exits") {|v| opt[:input_format] = v }
  op.on("--pepcentric-db-format", "prints peptide centric db format and exits") {|v| opt[:pepcentric_db_format] = v }
end

# later on we could implement full isoform resolution like IsoformResolver
# for now we will generate a report, realizing that some isoforms may not be
# reported
# it is implemented by using a pre-made map from sequence to protein groups
# then, a set of sequences allows one to deduce all the relationships from the
# protein groups.

opts.parse!

pd = Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER
kvd = Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER

if opt[:pepcentric_db_format]
  puts "pepcentric_db.yml needs to be in the format:"
  puts "<PEPTIDE>#{kvd.inspect}<ID>#{pd.inspect}<ID>#{pd.inspect}<ID>"
  puts "(The delimiters are shown with #inspect)"
end

if opt[:output_format]
  # skeleton of the YAML written to the output file (stray option-parser text
  # that had been pasted into the middle of it has been removed)
  yaml = <<SKEL
results:
- qvalue_cutoff: <Float>
  sets:
    <set_name>:
      num_uniq_aaseqs: <Integer>
      num_aaseqs_not_in_pep_db: <Integer>
      num_uniq_aaseqs_charge: <Integer>
      proteins:
        <protein_id>:
          num_hits_all:
          - <Integer> # total num aaseqs
          - <Integer> # total num aaseq+charge
          - <Integer> # total num hits
          num_hits_minimal:
          - <Integer> # total num aaseqs
          - <Integer> # total num aaseq+charge
          - <Integer> # total num hits
          indistinguishable:
          - <protein_id>
          - <protein_id>
          aaseqs:
          - <String>
          - <String>
sets_order:
- <String>
- <String>
SKEL
  print yaml
end

if opt[:input_format]
  string =<<EXPLANATION
# the sets_compare.txt format is very simple:

Set <some_name_for_set1>
filename1_no_ext
filename2_no_ext
Set <some_name_for_set2>
filename3_no_ext
filename4_no_ext
...
EXPLANATION
  puts string
end

# any of the format switches only prints its description
exit if opt.keys.any? {|key| key.to_s =~ /_format/ }

if ARGV.size != 2
  puts opts.to_s
  exit
end
218
+
# Main report: for every q-value cutoff and every set, collect the passing
# peptide hits, map them to proteins through the peptide centric db, reduce
# the proteins to a minimal set and write everything to the output YAML file.
(pepcentric_fn, sets_compare_fn) = ARGV

results = {}

results['results'] = []

(sets_hash, sets_order) = sets_compare_to_paths(sets_compare_fn)
results['sets_order'] = sets_order

STDERR.print "Loading peptide centric DB (this takes about a minute)..."
start = Time.now
Mspire::Ident::Peptide::Db::IO.open(pepcentric_fn) do |pep_to_prots|
  STDERR.puts "#{Time.now - start} seconds."

  opt[:cutoffs].each do |cutoff|

    cutoff_results = {'qvalue_cutoff' => cutoff}
    results_sets_hash = {}
    cutoff_results['sets'] = results_sets_hash
    results['results'] << cutoff_results

    #########################
    # FOR EACH SET:
    #########################
    # the hit Struct is built once from the first file's headers
    pep_klass = nil
    sets_hash.each do |set, files|
      set_results = {}
      results_sets_hash[set] = set_results

      # assumes the indices are the same into each data file

      # get the complete set of passing hits
      all_passing_hits = files.inject([]) do |passing_so_far, file|
        hash = YAML.load_file(file)

        header_hash = hash['headers']
        pep_klass ||= Struct.new(*(header_hash.map {|v| v.to_sym }))
        hits = hash['data'].map {|v| pep_klass.new(*v) }

        passing_hits =
          if cutoff
            hits.select {|hit| hit.qvalue <= cutoff }
          else
            hits
          end
        # concat avoids splatting a potentially huge array into arguments
        passing_so_far.concat(passing_hits)
      end

      # create an index from aaseq to hits
      seq_to_hits = Hash.new {|h,k| h[k] = []}
      uniq_seqcharge = Set.new
      all_passing_hits.each do |hit|
        seq_to_hits[hit.sequence] << hit
        uniq_seqcharge.add( hit.sequence + '_' + hit.charge.to_s )
      end

      # determine the number of uniq aaseqs
      uniq_seqs = seq_to_hits.size

      num_uniq_seqcharges = uniq_seqcharge.size

      set_results.merge!( { 'num_peptide_hits' => all_passing_hits.size,
                            'num_uniq_aaseqs' => uniq_seqs,
                            'num_uniq_aaseqs_charge' => num_uniq_seqcharges,
      })

      # create an index from proteins to peptides
      prots_to_peps = Hash.new {|h,k| h[k] = [] }
      peptides_not_found = []
      seq_to_hits.keys.each do |seq|
        # the db handle yielded above is pep_to_prots (the undefined name
        # pep_db used to be referenced here)
        if pep_to_prots.key?(seq)
          pep_to_prots[seq].each do |prot|
            prots_to_peps[prot] << seq
          end
        else
          peptides_not_found << seq
        end
      end

      # Determine the number of 1) hits, 2) aaseqs, 3) aaseqcharges per protein BEFORE minimization
      stats_per_protein_before = stats_per_prot(prots_to_peps, seq_to_hits)

      # get the minimal protein set
      # Without a block, minimal_protein_set ranks on [number of peptides,
      # summed peptide length].  The former block added a protein-length
      # tie-break from prot_sizes_hash, and a follow-up loop filled
      # protein_info from prot_header_hash; none of those names is defined
      # anywhere in this script, so both have been removed.
      (prot_to_uniq_peps_hash, indistinguishable_protein_hash) = minimal_protein_set(prots_to_peps)

      stats_per_protein_minimal = stats_per_prot(prot_to_uniq_peps_hash, seq_to_hits)

      # create a hash of data for each protein
      protein_data_hashes_hash = {}
      prot_to_uniq_peps_hash.each do |prot, peps|
        protein_data_hashes_hash[prot] = {
          'aaseqs' => peps,
          # this will be a triplet
          'num_hits_minimal' => stats_per_protein_minimal[prot],
          'indistinguishable' => indistinguishable_protein_hash[prot],
          'num_hits_all' => stats_per_protein_before[prot],
        }
      end

      set_results['proteins'] = protein_data_hashes_hash
      set_results['num_proteins'] = prot_to_uniq_peps_hash.size
      set_results['num_aaseqs_not_in_pep_db'] = peptides_not_found.size
      if peptides_not_found.size > 0
        warn "Did not find in peptide centric db: #{peptides_not_found.join(', ')}"
      end
    end
  end

  File.open(opt[:outfile], 'w') do |out|
    out.print results.to_yaml
  end

end
345
+
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'mspire/mzml'
4
+ require 'optparse'
5
+
# Command line driver: prints every spectrum with ms_level > 1 of each given
# mzML file to stdout as an MGF "BEGIN IONS" ... "END IONS" block.
opt = {
  filter_zero_intensity: true,
  retention_times: true,
}
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename($0)} <file>.mzML ..."
  # NOTE(review): the records are printed to stdout; nothing in this script
  # writes a <file>.mgf -- confirm whether the usage text or the code is right.
  op.separator "outputs: <file>.mgf"
  #op.on("--no-filter-zeros", "won't remove values with zero intensity") {|v| opt[:filter_zero_intensity] = false }
  # the default is set in ms/msrun/search.rb -> set_opts
  op.on("--no-retention-times", "won't include RT even if available") {|v| opt[:retention_times] = false }
end

opts.parse!

if ARGV.size == 0
  puts opts
  exit
end

ARGV.each do |file|
  unless File.exist?(file)
    # stderr, so the notice does not end up inside the MGF written to stdout
    warn "missing file: #{file} [skipping]"
    next
  end

  Mspire::Mzml.foreach(file).with_index do |spectrum,i|
    next unless spectrum.ms_level > 1
    puts "BEGIN IONS"
    # title is: index, spectrum id and (when known) the rounded retention time
    rt = spectrum.retention_time
    title_parts = [i, "id_#{spectrum.id}"]
    # a spectrum without a retention time used to raise on nil.round
    title_parts << "rt_#{rt.round}" if rt
    puts "TITLE=#{title_parts.join('.')}"
    puts "RTINSECONDS=#{rt}" if rt && opt[:retention_times]
    puts "PEPMASS=#{spectrum.precursor_mz}"
    charge = spectrum.precursor_charge
    # an unknown charge is left out rather than written as a bare "+"
    puts "CHARGE=#{charge}+" if charge
    spectrum.each do |mz,int|
      puts [mz, int].join(" ")
    end
    puts "END IONS"
    puts ""
  end
end