mspire 0.8.6 → 0.8.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/mspire.rb +1 -0
- data/script/accession_quantifier.rb +176 -0
- data/script/mzml_to_mgf.rb +30 -13
- data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +15 -1
- metadata +3 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.6
|
1
|
+
0.8.6.1
|
data/lib/mspire.rb
CHANGED
@@ -3,4 +3,5 @@ require 'mspire/mass/aa' # requires mspire/mass & therefore mspire/molecular_for
|
|
3
3
|
|
4
4
|
module Mspire
|
5
5
|
VERSION = IO.read(File.join(File.dirname(__FILE__), '..', 'VERSION')).chomp
|
6
|
+
CITE = "Prince JT, Marcotte EM. mspire: mass spectrometry proteomics in Ruby. Bioinformatics. 2008. 24(23):2796-7."
|
6
7
|
end
|
@@ -0,0 +1,176 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'ostruct'
|
5
|
+
require 'set'
|
6
|
+
|
7
|
+
require 'mspire/mzml'
|
8
|
+
require 'mspire/digester'
|
9
|
+
require 'mspire/mascot/dat'
|
10
|
+
|
11
|
+
|
12
|
+
class Array

  # Adds up the elements after converting each one with to_f.
  # Returns nil for an empty array.
  def sum
    total = nil
    each do |item|
      total = total ? total + item.to_f : item.to_f
    end
    total
  end

  # Mean of the receiver's values, each weighted by the entry of
  # weights_array at the same position.
  def weighted_mean(weights_array)
    total_weight = weights_array.sum
    weighted_total = 0
    each_with_index do |value, position|
      weighted_total += value * weights_array[position].to_f
    end
    weighted_total.to_f / total_weight.to_f
  end
end
|
25
|
+
|
26
|
+
|
27
|
+
# Default settings; every value can be overridden by the matching
# command-line switch defined below.
opt = OpenStruct.new( {
  max_rt_before: 60,
  max_rt_after: 60,
  mz_window: 0.01,
  scan_id_regex: Regexp.new("(.*)"),
  # the regex I use:
  #scan_id_regex: Regexp.new("id_([^\\.]+)"),
} )

# Command-line parser. Each switch shows its current default inside <...>
# and writes the parsed value back into opt.
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} [OPTS] <mzML> <dat> <accession> ..."
  op.separator "output: <TBD>"
  op.separator ""
  op.separator "options: "
  op.on("--max_rt_before <#{opt.max_rt_before}>", Float, "(sec) max RT to look before") {|v| opt.max_rt_before = v }
  op.on("--max_rt_after <#{opt.max_rt_after}>", Float, "(sec) max RT to look after") {|v| opt.max_rt_after = v }
  op.on("--mz_window <#{opt.mz_window}>", Float, "(Th) window around m/z value") {|v| opt.mz_window = v }
  # first capture group of the regex is taken as the spectrum id
  op.on("--scan_id_regex <#{opt.scan_id_regex.source}>", "scan") {|v| opt.scan_id_regex = Regexp.new(v) }
  op.on("--add-filename", "adds the filename to output files") {|v| opt.add_filename = v }
end
|
48
|
+
opts.parse!
|
49
|
+
|
50
|
+
# An mzML file, a dat file and at least one accession are required;
# otherwise print the usage text and quit.
unless ARGV.size >= 3
  puts opts
  exit
end

mzml_file, dat_file, *accessions_array = ARGV

# kept as a Set so it can be intersected with each hit's accessions
accessions = accessions_array.to_set
|
58
|
+
|
59
|
+
# Builds an extracted-ion chromatogram: an array of
# [retention_time, summed_intensity] pairs, one per visited spectrum that has
# at least one peak inside the m/z window.
#
# mzml       - object indexed by spectrum position (mzml[i]); a nil/false
#              result ends the walk
# index_enum - external enumerator of spectrum positions (e.g. 5.downto(0));
#              the walk also ends when it is exhausted
# mz         - centre of the m/z window
# mz_window  - full width of the window (mz +/- mz_window/2, inclusive)
# ms_level   - matched with === against spectrum.ms_level; non-matching
#              spectra are skipped
# block      - required; receives each matching spectrum's retention time and
#              stops the walk by returning nil/false
#
# Pairs are returned in the order the spectra were visited.
def create_chromatogram(mzml, index_enum, mz, mz_window, ms_level=1, &block)
  half_window = mz_window / 2.0
  lwin_mz = mz - half_window
  hwin_mz = mz + half_window

  chromatogram = []
  # Kernel#loop ends quietly on StopIteration, so a finite enumerator that
  # runs out before the block says stop no longer raises.
  loop do
    spec_index = index_enum.next
    break unless spec_index
    break unless (spectrum = mzml[spec_index])
    next unless ms_level === spectrum.ms_level
    break unless block.call( spectrum.retention_time )

    mzs = spectrum.mzs
    ints = spectrum.intensities
    nearest = spectrum.find_nearest_index(mz)
    # NOTE(review): assumes find_nearest_index may return nil for a spectrum
    # without peaks -- confirm against Mspire's spectrum implementation
    next unless nearest

    # Walk outwards from the nearest peak in both directions.
    # NOTE(review): this relies on mzs being sorted ascending -- confirm.
    ints_in_range = []
    # upwards: stop at the end of the peak list or once past the upper edge;
    # the lower-edge check rejects a nearest peak that sits below the window
    nearest.upto(mzs.size - 1) do |i|
      break if mzs[i] > hwin_mz
      ints_in_range << ints[i] if mzs[i] >= lwin_mz
    end
    # downwards: stop once below the lower edge
    (nearest - 1).downto(0) do |i|
      break if mzs[i] < lwin_mz
      ints_in_range << ints[i] if mzs[i] <= hwin_mz
    end

    if ints_in_range.size > 0
      chromatogram << [spectrum.retention_time, ints_in_range.reduce(:+)]
    end
  end
  chromatogram
end
|
95
|
+
|
96
|
+
# One record per peptide hit that maps to a requested accession. The
# chromatogram member is left unset here and assigned after the mzML pass.
Pephit = Struct.new(:spectrum_id, :exp_mz, :charge, :seq, :accessions, :var_mods_string, :chromatogram)

pephits = []
Mspire::Mascot::Dat.open(dat_file) do |dat|
  dat.each_peptide(1) do |hit|
    # keep only hits that map to at least one requested accession
    matched = accessions & hit.protein_hits_info.map(&:accession)
    next unless matched.size > 0

    query = dat.query(hit.query_num)
    charge = query.charge
    exp_mr = hit.mr + hit.delta
    # m/z for this charge state: (mass + z * H+) / z
    exp_mz = (exp_mr + (charge * Mspire::Mass::H_PLUS)) / charge
    title_match = opt.scan_id_regex.match(query.title)
    # stays nil when the query title does not match the regex
    spectrum_id = title_match ? title_match[1] : nil
    pephits << Pephit.new(spectrum_id, exp_mz, charge, hit.seq, matched.to_a, hit.var_mods_string)
  end
end

puts "Found: #{pephits.size} pephits"
# nothing to quantify
exit if pephits.empty?
|
118
|
+
|
119
|
+
Mspire::Mzml.open(mzml_file) do |mzml|
  spec_index = mzml.index_list[:spectrum]

  # Sum over all spectra of the value stored under MS:1000285; every
  # chromatogram intensity is divided by (that sum / 1e7) below.
  per_spectrum_tic = mzml.map {|spec| spec.fetch_by_acc('MS:1000285').to_f }
  tic = per_spectrum_tic.reduce(:+)
  divisor = tic.to_f/1e7

  # spectrum id => position in the file
  id_to_index = spec_index.ids.each_with_index.each_with_object({}) do |(id, position), lookup|
    lookup[id] = position
  end

  pephits.each do |pephit|
    print "." ; $stdout.flush

    # start from the spectrum referenced by the hit's first precursor
    precursor_id = mzml[pephit.spectrum_id].precursors.first.spectrum_id
    start_index = id_to_index[precursor_id]
    anchor_rt = mzml[start_index].retention_time

    earliest_rt = anchor_rt - opt.max_rt_before
    latest_rt = anchor_rt + opt.max_rt_after

    # walk backwards (including the start spectrum) and forwards until the
    # retention time leaves the allowed range
    backward = create_chromatogram(mzml, start_index.downto(0), pephit.exp_mz, opt.mz_window) {|rt| rt >= earliest_rt }
    forward = create_chromatogram(mzml, (start_index+1).upto(Float::INFINITY), pephit.exp_mz, opt.mz_window) {|rt| rt <= latest_rt }

    # sort the [rt, intensity] pairs, then scale the intensities in place
    pephit.chromatogram = (backward + forward).sort.each {|pair| pair[1] /= divisor }
  end
end
puts "finished with mzml"
|
150
|
+
|
151
|
+
# Hits sharing sequence, charge and variable modifications are merged into
# one combined record and written to one .tsv file per group.
grouped = pephits.group_by {|pephit| [pephit.seq, pephit.charge, pephit.var_mods_string] }
grouped.each do |group_key, members|
  puts "grouping: #{group_key.join(', ')}"
  mean_exp_mz = members.map(&:exp_mz).reduce(:+) / members.size
  merged_chrom = members.flat_map(&:chromatogram).uniq.sort

  # the combined record takes its descriptive fields from the first member;
  # its spectrum_id slot holds the member count in parentheses
  representative = members.first
  cpephit = Pephit.new(
    "(#{members.size})",
    mean_exp_mz,
    representative.charge,
    representative.seq,
    representative.accessions,
    representative.var_mods_string,
    merged_chrom
  )

  name_parts = [cpephit.seq, cpephit.charge, cpephit.var_mods_string]
  # optionally prefix with the dat file name minus its extension
  name_parts.unshift(dat_file.chomp(File.extname(dat_file))) if opt.add_filename
  filename = name_parts.join(".") + ".tsv"

  puts "writing: #{filename}"
  File.open(filename, 'w') do |out|
    # header: every field except the chromatogram as a "# key: value" line
    cpephit.each_pair do |k,v|
      out.puts "# #{k}: #{v}" unless k.to_sym == :chromatogram
    end
    out.puts
    out.puts "rt(sec)\tnorm_intensity"
    cpephit.chromatogram.each {|row| out.puts row.join("\t") }
  end
end
|
175
|
+
|
176
|
+
|
data/script/mzml_to_mgf.rb
CHANGED
@@ -3,6 +3,13 @@
|
|
3
3
|
require 'mspire/mzml'
|
4
4
|
require 'optparse'
|
5
5
|
|
6
|
+
|
7
|
+
# Formats a signed charge state as magnitude followed by sign:
# returns '3+' for 3 or '2-' for -2
# (any value that is not greater than zero gets the '-' suffix)
def mascot_charge(val)
  # use the magnitude so a negative charge gives '2-', not '-2-'
  "#{val.abs}#{val > 0 ? '+' : '-'}"
end
|
11
|
+
|
12
|
+
|
6
13
|
opt = {
|
7
14
|
filter_zero_intensity: true,
|
8
15
|
retention_times: true,
|
@@ -10,7 +17,7 @@ opt = {
|
|
10
17
|
opts = OptionParser.new do |op|
|
11
18
|
op.banner = "usage: #{File.basename($0)} <file>.mzML ..."
|
12
19
|
op.separator "outputs: <file>.mgf"
|
13
|
-
|
20
|
+
op.on("--no-filter-zeros", "won't remove values with zero intensity") {|v| opt[:filter_zero_intensity] = false }
|
14
21
|
# the default is set in ms/msrun/search.rb -> set_opts
|
15
22
|
op.on("--no-retention-times", "won't include RT even if available") {|v| opt[:retention_times] = false }
|
16
23
|
end
|
@@ -22,25 +29,35 @@ if ARGV.size == 0
|
|
22
29
|
exit
|
23
30
|
end
|
24
31
|
|
32
|
+
filter_zeros = opt[:filter_zero_intensity]
|
33
|
+
|
25
34
|
ARGV.each do |file|
|
26
|
-
|
35
|
+
basename = file.chomp(File.extname(file))
|
36
|
+
outfile = basename + ".mgf"
|
37
|
+
|
38
|
+
File.open(outfile, 'w') do |out|
|
27
39
|
Mspire::Mzml.foreach(file).with_index do |spectrum,i|
|
28
40
|
next unless spectrum.ms_level > 1
|
29
|
-
puts "BEGIN IONS"
|
41
|
+
out.puts "BEGIN IONS"
|
30
42
|
# id, spectrumid,
|
31
43
|
rt = spectrum.retention_time
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
puts "
|
36
|
-
puts "
|
44
|
+
title_ar = [i, "id_#{spectrum.id}"]
|
45
|
+
title_ar.push("rt_#{rt.round}") if opt[:retention_times]
|
46
|
+
title = title_ar.join('.')
|
47
|
+
out.puts "TITLE=#{title}"
|
48
|
+
out.puts "RTINSECONDS=#{rt}" if opt[:retention_times]
|
49
|
+
out.puts "PEPMASS=#{spectrum.precursor_mz}"
|
50
|
+
if z=spectrum.precursor_charge
|
51
|
+
out.puts "CHARGE=#{mascot_charge(z)}"
|
52
|
+
end
|
53
|
+
|
37
54
|
spectrum.each do |mz,int|
|
38
|
-
|
55
|
+
unless filter_zeros && (int==0.0)
|
56
|
+
out.puts([mz, int].join(" "))
|
57
|
+
end
|
39
58
|
end
|
40
|
-
puts "END IONS"
|
41
|
-
puts ""
|
59
|
+
out.puts "END IONS"
|
60
|
+
out.puts ""
|
42
61
|
end
|
43
|
-
else
|
44
|
-
puts "missing file: #{file} [skipping]"
|
45
62
|
end
|
46
63
|
end
|
@@ -4,6 +4,7 @@ require 'andand'
|
|
4
4
|
require 'set'
|
5
5
|
require 'ruport'
|
6
6
|
|
7
|
+
require 'mspire'
|
7
8
|
require 'mspire/ident/peptide_hit/qvalue'
|
8
9
|
require 'mspire/ident/peptide_hit'
|
9
10
|
require 'mspire/ident/protein_group'
|
@@ -96,11 +97,14 @@ group names can be arbitrarily defined
|
|
96
97
|
opt :qspec_decibans, "report bayesfactor in decibans"
|
97
98
|
opt :qspec_normalize, "normalize spectral counts per run", :default => false
|
98
99
|
opt :qspec_keep_files, "keep a copy of the files submitted and returned from Qspec", :default => false
|
100
|
+
opt :version_tag, "pass in a version tag (e.g. pass in git describe --tags) for version record", :type => String
|
99
101
|
opt :write_subset, "(dev use only) write subset db", :default => false
|
100
102
|
end
|
101
103
|
|
104
|
+
commandline_incantation = __FILE__ + " " + ARGV.join(" ")
|
102
105
|
opt = opts.parse(ARGV)
|
103
106
|
opt[:count_type] = opt[:count_type].to_sym
|
107
|
+
outfile = opt[:outfile] || outfile
|
104
108
|
|
105
109
|
$VERBOSE = opt.delete(:verbose)
|
106
110
|
|
@@ -271,5 +275,15 @@ if opt[:peptides]
|
|
271
275
|
hits_table.to_tsv(pephits_outfile, :footer => ["parallel to #{outfile}"])
|
272
276
|
end
|
273
277
|
|
274
|
-
intro = [
|
278
|
+
intro = [
|
279
|
+
"",
|
280
|
+
"ruby: #{RUBY_VERSION}",
|
281
|
+
"software: mspire #{Mspire::VERSION}",
|
282
|
+
"cite: #{Mspire::CITE}",
|
283
|
+
"samples: #{samplename_to_filename}",
|
284
|
+
"options: #{opt}",
|
285
|
+
"commandline: #{commandline_incantation}"
|
286
|
+
]
|
287
|
+
intro.insert(3, "version_tag: #{opt[:version_tag]}") if opt[:version_tag]
|
288
|
+
|
275
289
|
counts_table.to_tsv(outfile, :footer => intro)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mspire
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.6
|
4
|
+
version: 0.8.6.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-04-
|
13
|
+
date: 2013-04-16 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: nokogiri
|
@@ -314,6 +314,7 @@ files:
|
|
314
314
|
- obo/ms.obo
|
315
315
|
- obo/unit.obo
|
316
316
|
- schema/peptide_hit_qvalues.pqh.tsv
|
317
|
+
- script/accession_quantifier.rb
|
317
318
|
- script/download_uniprotkb_db.rb
|
318
319
|
- script/fasta_to_peptide_centric_db.rb
|
319
320
|
- script/mascot_dat_to_peptide_hit_qvalues.rb
|