mspire 0.8.6 → 0.8.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/mspire.rb +1 -0
- data/script/accession_quantifier.rb +176 -0
- data/script/mzml_to_mgf.rb +30 -13
- data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +15 -1
- metadata +3 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.6
|
1
|
+
0.8.6.1
|
data/lib/mspire.rb
CHANGED
@@ -3,4 +3,5 @@ require 'mspire/mass/aa' # requires mspire/mass & therefore mspire/molecular_for
|
|
3
3
|
|
4
4
|
module Mspire
|
5
5
|
VERSION = IO.read(File.join(File.dirname(__FILE__), '..', 'VERSION')).chomp
|
6
|
+
CITE = "Prince JT, Marcotte EM. mspire: mass spectrometry proteomics in Ruby. Bioinformatics. 2008. 24(23):2796-7."
|
6
7
|
end
|
@@ -0,0 +1,176 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'ostruct'
|
5
|
+
require 'set'
|
6
|
+
|
7
|
+
require 'mspire/mzml'
|
8
|
+
require 'mspire/digester'
|
9
|
+
require 'mspire/mascot/dat'
|
10
|
+
|
11
|
+
|
12
|
+
# Numeric helpers added to Array for this script.
#
# NOTE: this reopens a core class, so the definitions below are visible to
# every piece of code loaded into the same process.
class Array

  # Sums the elements.
  #
  # With no argument and no block (the only form this script uses) every
  # element is coerced with #to_f and a Float is returned; an empty array
  # returns nil.
  #
  # An explicit +init+ and/or a block are accepted as well, mirroring the
  # call forms of the built-in Array#sum, so that code written against the
  # built-in signature keeps working once this definition has replaced it.
  # In those forms no float coercion is applied and the start value
  # defaults to 0.
  #
  # init  - optional start value for the accumulation
  # block - optional transformation applied to each element before adding
  #
  # Returns the sum (Float or nil in the no-argument form).
  def sum(init = nil, &block)
    if init.nil? && block.nil?
      return inject( nil ) { |acc,x| acc ? acc + x.to_f : x.to_f }
    end
    values = block ? map(&block) : self
    values.inject(init.nil? ? 0 : init) { |acc,x| acc + x }
  end

  # Mean of the receiver's values weighted by +weights_array+.
  #
  # weights_array - weights, paired with the receiver's elements by index;
  #                 each weight is coerced with #to_f (so a missing weight
  #                 counts as 0.0 in the numerator)
  #
  # Returns sum(value * weight) / sum(weights) as a Float (NaN for an empty
  # receiver with no weights, since that is 0.0 / 0.0).
  def weighted_mean(weights_array)
    w_sum = weights_array.sum
    w_prod = 0
    each_with_index { |value,i| w_prod += value * weights_array[i].to_f }
    w_prod.to_f / w_sum.to_f
  end
end
|
25
|
+
|
26
|
+
|
27
|
+
# Default settings; each one can be overridden by the command-line switch of
# the same name defined below.
opt = OpenStruct.new( {
  max_rt_before: 60,
  max_rt_after: 60,
  mz_window: 0.01,
  scan_id_regex: Regexp.new("(.*)"),
  # the regex I use:
  #scan_id_regex: Regexp.new("id_([^\\.]+)"),
} )


# Command-line definition.  Every handler stores the parsed value on +opt+.
# Each block must declare its parameter as |v|: written as {v| ... } the block
# takes no parameter and evaluates `v | (...)`, which raises NameError as soon
# as the switch is actually used.
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} [OPTS] <mzML> <dat> <accession> ..."
  op.separator "output: <TBD>"
  op.separator ""
  op.separator "options: "
  op.on("--max_rt_before <#{opt.max_rt_before}>", Float, "(sec) max RT to look before") {|v| opt.max_rt_before = v }
  op.on("--max_rt_after <#{opt.max_rt_after}>", Float, "(sec) max RT to look after") {|v| opt.max_rt_after = v }
  op.on("--mz_window <#{opt.mz_window}>", Float, "(Th) window around m/z value") {|v| opt.mz_window = v }
  # the first capture group of this regex, applied to a query title, is used as the spectrum id
  op.on("--scan_id_regex <#{opt.scan_id_regex.source}>", "scan") {|v| opt.scan_id_regex = Regexp.new(v) }
  op.on("--add-filename", "adds the filename to output files") {|v| opt.add_filename = v }
end
|
48
|
+
# strip the recognised switches out of ARGV, leaving the positional arguments
opts.parse!

# an mzML file, a dat file and at least one accession are required;
# otherwise show the usage text and stop
unless ARGV.size >= 3
  puts opts
  exit
end

mzml_file, dat_file, *accessions_array = ARGV

# the accessions of interest, held as a Set
accessions = Set.new(accessions_array)
|
58
|
+
|
59
|
+
# Builds an extracted-ion chromatogram around +mz+ by walking spectra in the
# order given by +index_enum+.
#
# mzml       - object returning a spectrum (or nil) for mzml[index]
# index_enum - external enumerator of spectrum indices to visit, e.g.
#              index.downto(0) or (index+1).upto(Float::INFINITY)
# mz         - the m/z value the window is centred on
# mz_window  - full width of the window (mz +/- mz_window/2)
# ms_level   - only spectra whose ms_level matches (via ===) are used
# block      - yields the retention time in seconds and stops iteration if
#              the block returns nil/false
#
# Iteration ends when the enumerator is exhausted, when it yields nil/false,
# when mzml has no spectrum at an index, or when the block says stop.
#
# Returns an array of [retention_time, summed_intensity] pairs, one for each
# visited spectrum that has at least one peak inside the window.
def create_chromatogram(mzml, index_enum, mz, mz_window, ms_level=1, &block)
  half_window = mz_window / 2.0
  lwin_mz = mz - half_window
  hwin_mz = mz + half_window

  chromatogram = []
  # Kernel#loop ends quietly on StopIteration, so a finite enumerator such as
  # index.downto(0) may simply run out.
  loop do
    spec_index = index_enum.next
    break unless spec_index
    spectrum = mzml[spec_index]
    break unless spectrum
    next unless ms_level === spectrum.ms_level
    rt = spectrum.retention_time
    break unless block.call( rt )

    mzs = spectrum.mzs
    ints = spectrum.intensities
    nearest = spectrum.find_nearest_index(mz)

    # Scan outward from the nearest peak in both directions.  Both window
    # bounds are checked in each direction so that a nearest peak lying
    # outside the window is not counted.
    # NOTE(review): assumes mzs is sorted ascending and responds to #size --
    # confirm against the spectrum class.
    ints_in_range = []
    nearest.upto(mzs.size - 1) do |i|
      break if mzs[i] > hwin_mz
      ints_in_range << ints[i] if mzs[i] >= lwin_mz
    end
    (nearest - 1).downto(0) do |i|
      break if mzs[i] < lwin_mz
      ints_in_range << ints[i] if mzs[i] <= hwin_mz
    end

    if ints_in_range.size > 0
      chromatogram << [rt, ints_in_range.reduce(:+)]
    end
  end
  chromatogram
end
|
95
|
+
|
96
|
+
# One record per peptide hit of interest; :chromatogram is left unset here
# and assigned later in the script.
Pephit = Struct.new(:spectrum_id, :exp_mz, :charge, :seq, :accessions, :var_mods_string, :chromatogram)

# Collect every peptide hit in the dat file that maps to at least one of the
# requested accessions.
pephits = []
Mspire::Mascot::Dat.open(dat_file) do |dat|
  # NOTE(review): the meaning of the argument 1 to each_peptide is not visible
  # here -- confirm against Mspire::Mascot::Dat.
  dat.each_peptide(1) do |hit|
    shared = accessions & hit.protein_hits_info.map(&:accession)
    next if shared.size == 0

    query = dat.query(hit.query_num)
    charge = query.charge
    # experimental m/z from the hit's mr plus delta, with charge * H_PLUS added, divided by charge
    observed_mr = hit.mr + hit.delta
    observed_mz = (observed_mr + (charge * Mspire::Mass::H_PLUS)) / charge

    # the spectrum id is the regex's first capture group from the query title
    # (nil when the title does not match)
    title_match = opt.scan_id_regex.match(query.title)
    scan_id = title_match ? title_match[1] : nil

    pephits << Pephit.new(scan_id, observed_mz, charge, hit.seq, shared.to_a, hit.var_mods_string)
  end
end

puts "Found: #{pephits.size} pephits"
# nothing to quantify
exit if pephits.size <= 0
|
118
|
+
|
119
|
+
# Attach a normalized chromatogram to every collected peptide hit.
Mspire::Mzml.open(mzml_file) do |mzml|
  spec_index = mzml.index_list[:spectrum]

  # Sum of each spectrum's 'MS:1000285' value (presumably the total ion
  # current -- confirm against the PSI-MS vocabulary); intensities are
  # divided by this total scaled down by 1e7.
  run_total = mzml.map {|spec| spec.fetch_by_acc('MS:1000285').to_f }.reduce(:+)
  divisor = run_total.to_f / 1e7

  # spectrum id => position in the spectrum index
  id_to_index = {}
  spec_index.ids.each_with_index do |id, position|
    id_to_index[id] = position
  end

  pephits.each do |pephit|
    # progress indicator
    print "."
    $stdout.flush

    # start from the spectrum referenced by the hit's first precursor
    hit_spectrum = mzml[pephit.spectrum_id]
    parent_id = hit_spectrum.precursors.first.spectrum_id
    parent_index = id_to_index[parent_id]
    parent_rt = mzml[parent_index].retention_time

    earliest_rt = parent_rt - opt.max_rt_before
    latest_rt = parent_rt + opt.max_rt_after

    # walk backwards (including the parent spectrum itself), then forwards
    backward = create_chromatogram(mzml, parent_index.downto(0), pephit.exp_mz, opt.mz_window) {|rt| rt >= earliest_rt }
    forward = create_chromatogram(mzml, (parent_index+1).upto(Float::INFINITY), pephit.exp_mz, opt.mz_window) {|rt| rt <= latest_rt }

    # order by retention time, then normalize each intensity in place
    normalized = (backward + forward).sort
    normalized.each {|pair| pair[1] /= divisor }

    pephit.chromatogram = normalized
  end
end
puts "finished with mzml"
|
150
|
+
|
151
|
+
# Merge hits sharing sequence, charge and variable modifications, and write
# one tab-separated chromatogram file per group.
grouped = pephits.group_by {|pephit| [pephit.seq, pephit.charge, pephit.var_mods_string] }
grouped.each do |group, members|
  puts "grouping: #{group.join(', ')}"

  mean_exp_mz = members.map(&:exp_mz).reduce(:+) / members.size
  # pool the data points of every member, dropping exact duplicates
  merged_chrom = members.flat_map(&:chromatogram).uniq.sort

  # the combined hit records the group size in place of a spectrum id and
  # takes its remaining fields from the first member
  representative = members.first
  cpephit = Pephit.new("(#{members.size})", mean_exp_mz, representative.charge, representative.seq, representative.accessions, representative.var_mods_string, merged_chrom)

  fileparts = [cpephit.seq, cpephit.charge, cpephit.var_mods_string]
  # optionally prefix with the dat file name minus its extension
  fileparts.unshift(dat_file.chomp(File.extname(dat_file))) if opt.add_filename
  filename = "#{fileparts.join(".")}.tsv"

  puts "writing: #{filename}"
  File.open(filename, 'w') do |out|
    # header: every field except the chromatogram, as '# key: value' lines
    cpephit.each_pair do |k,v|
      next if k.to_sym == :chromatogram
      out.puts "# #{k}: #{v}"
    end
    out.puts
    out.puts "rt(sec)\tnorm_intensity"
    cpephit.chromatogram.each {|row| out.puts row.join("\t") }
  end
end
|
175
|
+
|
176
|
+
|
data/script/mzml_to_mgf.rb
CHANGED
@@ -3,6 +3,13 @@
|
|
3
3
|
require 'mspire/mzml'
|
4
4
|
require 'optparse'
|
5
5
|
|
6
|
+
|
7
|
+
# Formats a signed integer charge state as magnitude followed by sign.
#
# val - Integer charge state
#
# returns '3+' for 3 or '2-' for -2 (the magnitude comes from #abs so the
# minus sign is not emitted twice)
def mascot_charge(val)
  "#{val.abs}#{val > 0 ? '+' : '-'}"
end
|
11
|
+
|
12
|
+
|
6
13
|
opt = {
|
7
14
|
filter_zero_intensity: true,
|
8
15
|
retention_times: true,
|
@@ -10,7 +17,7 @@ opt = {
|
|
10
17
|
opts = OptionParser.new do |op|
|
11
18
|
op.banner = "usage: #{File.basename($0)} <file>.mzML ..."
|
12
19
|
op.separator "outputs: <file>.mgf"
|
13
|
-
|
20
|
+
op.on("--no-filter-zeros", "won't remove values with zero intensity") {|v| opt[:filter_zero_intensity] = false }
|
14
21
|
# the default is set in ms/msrun/search.rb -> set_opts
|
15
22
|
op.on("--no-retention-times", "won't include RT even if available") {|v| opt[:retention_times] = false }
|
16
23
|
end
|
@@ -22,25 +29,35 @@ if ARGV.size == 0
|
|
22
29
|
exit
|
23
30
|
end
|
24
31
|
|
32
|
+
filter_zeros = opt[:filter_zero_intensity]
|
33
|
+
|
25
34
|
ARGV.each do |file|
|
26
|
-
|
35
|
+
basename = file.chomp(File.extname(file))
|
36
|
+
outfile = basename + ".mgf"
|
37
|
+
|
38
|
+
File.open(outfile, 'w') do |out|
|
27
39
|
Mspire::Mzml.foreach(file).with_index do |spectrum,i|
|
28
40
|
next unless spectrum.ms_level > 1
|
29
|
-
puts "BEGIN IONS"
|
41
|
+
out.puts "BEGIN IONS"
|
30
42
|
# id, spectrumid,
|
31
43
|
rt = spectrum.retention_time
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
puts "
|
36
|
-
puts "
|
44
|
+
title_ar = [i, "id_#{spectrum.id}"]
|
45
|
+
title_ar.push("rt_#{rt.round}") if opt[:retention_times]
|
46
|
+
title = title_ar.join('.')
|
47
|
+
out.puts "TITLE=#{title}"
|
48
|
+
out.puts "RTINSECONDS=#{rt}" if opt[:retention_times]
|
49
|
+
out.puts "PEPMASS=#{spectrum.precursor_mz}"
|
50
|
+
if z=spectrum.precursor_charge
|
51
|
+
out.puts "CHARGE=#{mascot_charge(z)}"
|
52
|
+
end
|
53
|
+
|
37
54
|
spectrum.each do |mz,int|
|
38
|
-
|
55
|
+
unless filter_zeros && (int==0.0)
|
56
|
+
out.puts([mz, int].join(" "))
|
57
|
+
end
|
39
58
|
end
|
40
|
-
puts "END IONS"
|
41
|
-
puts ""
|
59
|
+
out.puts "END IONS"
|
60
|
+
out.puts ""
|
42
61
|
end
|
43
|
-
else
|
44
|
-
puts "missing file: #{file} [skipping]"
|
45
62
|
end
|
46
63
|
end
|
@@ -4,6 +4,7 @@ require 'andand'
|
|
4
4
|
require 'set'
|
5
5
|
require 'ruport'
|
6
6
|
|
7
|
+
require 'mspire'
|
7
8
|
require 'mspire/ident/peptide_hit/qvalue'
|
8
9
|
require 'mspire/ident/peptide_hit'
|
9
10
|
require 'mspire/ident/protein_group'
|
@@ -96,11 +97,14 @@ group names can be arbitrarily defined
|
|
96
97
|
opt :qspec_decibans, "report bayesfactor in decibans"
|
97
98
|
opt :qspec_normalize, "normalize spectral counts per run", :default => false
|
98
99
|
opt :qspec_keep_files, "keep a copy of the files submitted and returned from Qspec", :default => false
|
100
|
+
opt :version_tag, "pass in a version tag (e.g. pass in git describe --tags) for version record", :type => String
|
99
101
|
opt :write_subset, "(dev use only) write subset db", :default => false
|
100
102
|
end
|
101
103
|
|
104
|
+
commandline_incantation = __FILE__ + " " + ARGV.join(" ")
|
102
105
|
opt = opts.parse(ARGV)
|
103
106
|
opt[:count_type] = opt[:count_type].to_sym
|
107
|
+
outfile = opt[:outfile] || outfile
|
104
108
|
|
105
109
|
$VERBOSE = opt.delete(:verbose)
|
106
110
|
|
@@ -271,5 +275,15 @@ if opt[:peptides]
|
|
271
275
|
hits_table.to_tsv(pephits_outfile, :footer => ["parallel to #{outfile}"])
|
272
276
|
end
|
273
277
|
|
274
|
-
intro = [
|
278
|
+
intro = [
|
279
|
+
"",
|
280
|
+
"ruby: #{RUBY_VERSION}",
|
281
|
+
"software: mspire #{Mspire::VERSION}",
|
282
|
+
"cite: #{Mspire::CITE}",
|
283
|
+
"samples: #{samplename_to_filename}",
|
284
|
+
"options: #{opt}",
|
285
|
+
"commandline: #{commandline_incantation}"
|
286
|
+
]
|
287
|
+
intro.insert(3, "version_tag: #{opt[:version_tag]}") if opt[:version_tag]
|
288
|
+
|
275
289
|
counts_table.to_tsv(outfile, :footer => intro)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mspire
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.6
|
4
|
+
version: 0.8.6.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-04-
|
13
|
+
date: 2013-04-16 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: nokogiri
|
@@ -314,6 +314,7 @@ files:
|
|
314
314
|
- obo/ms.obo
|
315
315
|
- obo/unit.obo
|
316
316
|
- schema/peptide_hit_qvalues.pqh.tsv
|
317
|
+
- script/accession_quantifier.rb
|
317
318
|
- script/download_uniprotkb_db.rb
|
318
319
|
- script/fasta_to_peptide_centric_db.rb
|
319
320
|
- script/mascot_dat_to_peptide_hit_qvalues.rb
|