mzid 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/convert_mzid_to_csv +53 -0
- data/bin/load_helper.rb +2 -0
- data/lib/mzid.rb +14 -0
- data/lib/mzid/base_parser.rb +45 -0
- data/lib/mzid/batch_parser.rb +148 -0
- data/lib/mzid/filtered_streaming_parser.rb +257 -0
- data/lib/mzid/parser_sax.rb +292 -0
- data/lib/mzid/peptide_evidence.rb +39 -0
- data/lib/mzid/psm.rb +61 -0
- data/lib/mzid/streaming_parser.rb +177 -0
- data/lib/mzid/streaming_parser_lines.rb +179 -0
- data/lib/mzid/version.rb +3 -0
- data/tests/data/example.mzid +71 -0
- data/tests/data/example_2.mzid +118 -0
- data/tests/data/example_mod.mzid +112 -0
- data/tests/load_helper.rb +1 -0
- data/tests/test_all.rb +6 -0
- data/tests/test_batch_parser.rb +86 -0
- data/tests/test_default_parser.rb +72 -0
- data/tests/test_helper.rb +8 -0
- data/tests/test_parser_sax.rb +47 -0
- data/tests/test_psm.rb +15 -0
- data/tests/test_streaming_parser.rb +87 -0
- data/tests/test_streaming_parser_lines.rb +104 -0
- metadata +162 -0
@@ -0,0 +1,292 @@
|
|
1
|
+
require 'ox'
|
2
|
+
require 'progressbar'
|
3
|
+
require 'csv'
|
4
|
+
|
5
|
+
module MzID
|
6
|
+
#
|
7
|
+
# class to parse an mzIdentML file (.mzid) in a memory efficient manner.
|
8
|
+
# can parse large files that a DOM parser will fail on, e.g., most mzid
|
9
|
+
# parsers. The caveat is that it must be written to a csv file.
|
10
|
+
#
|
11
|
+
class ParserSax
|
12
|
+
#
# SAX handler that tallies the start tags of the four element types the
# parser cares about (used to size the progress bars)
#
class CounterHandler < Ox::Sax
  ATTR = [:DBSequence, :Peptide, :PeptideEvidence, :SpectrumIdentificationItem]

  attr_accessor :dbseq_count, :pep_count, :pepev_count, :spec_count

  #
  # every tally starts at zero
  #
  def initialize
    @dbseq_count = 0
    @pep_count = 0
    @pepev_count = 0
    @spec_count = 0
  end

  #
  # bump the tally that belongs to +name+; any other element is ignored
  #
  def start_element(name)
    if name == :DBSequence
      @dbseq_count += 1
    elsif name == :Peptide
      @pep_count += 1
    elsif name == :PeptideEvidence
      @pepev_count += 1
    elsif name == :SpectrumIdentificationItem
      @spec_count += 1
    end
  end
end
|
40
|
+
#
# SAX handler that builds a lookup from DBSequence id (as a symbol) to the
# accession string of that DBSequence element
#
class DBSequenceHandler < Ox::Sax
  ATTR = [:DBSequence]

  attr_accessor :dbseq_h, :pbar

  #
  # num_dbseq:: expected number of DBSequence elements; when given, a
  #             progress bar of that size is created, otherwise none is shown
  #
  def initialize(num_dbseq=nil)
    @dbseq_h = {}
    @pbar = (ProgressBar.new("DBSeq", num_dbseq) unless num_dbseq.nil?)
  end

  #
  # remember which element is open; a new DBSequence gets a fresh scratch hash
  #
  def start_element(name)
    @h = {} if name == :DBSequence
    @curr_node = name
  end

  #
  # keep only the id and accession attributes of a DBSequence element
  #
  def attr(name, value)
    return unless ATTR.include?(@curr_node)
    @h[name] = value if [:accession, :id].include?(name)
  end

  #
  # callback kept for parity with the other handlers; it records nothing
  #
  def value(value)
    nil
  end

  #
  # on a closing DBSequence tag, advance the progress bar (if any) and
  # store the collected id => accession pair
  #
  def end_element(name)
    return unless name == :DBSequence
    @pbar.inc unless @pbar.nil?
    @dbseq_h[@h[:id].to_sym] = @h[:accession]
  end
end
|
73
|
+
#
# SAX handler for Peptide elements. Builds two lookups keyed by the peptide
# id (as a symbol):
#
# pep_h:: peptide id => text of the PeptideSequence child
# mod_h:: peptide id => { location (Integer) => monoisotopicMassDelta (String) },
#         present only for peptides with at least one Modification child
#
class PeptideHandler < Ox::Sax
  ATTR = [:Peptide, :PeptideSequence, :Modification]

  attr_accessor :pep_h, :mod_h, :pbar

  #
  # num_pep:: expected number of Peptide elements; when given, a progress
  #           bar of that size is created, otherwise none is shown
  #
  def initialize(num_pep=nil)
    @pbar = num_pep.nil? ? nil : ProgressBar.new("Peptides", num_pep)
    @pep_h = Hash.new
    @mod_h = Hash.new
  end

  #
  # track the open element; a Peptide resets the scratch state and every
  # Modification gets its own attribute hash
  #
  def start_element(name)
    if name == :Peptide
      @h = {}
      @mh = []
    elsif name == :Modification
      (@mh ||= []).push({})
    end
    @curr_node = name
  end

  #
  # Modification attributes are stored by name on the current modification,
  # so the result does not depend on the order the attributes appear in or
  # on how many other attributes the element carries
  #
  def attr(name, value)
    return unless ATTR.include?(@curr_node)
    if @curr_node == :Modification
      @mh.last[name] = value
    else
      @h[name] = value
    end
  end

  #
  # text content is stored under the name of the element it belongs to
  # (this is how the PeptideSequence string is captured)
  #
  def text(value)
    return unless ATTR.include?(@curr_node)
    @h[@curr_node] = value
  end

  #
  # on a closing Peptide tag, advance the progress bar (if any) and publish
  # the sequence and modification lookups for this peptide
  #
  def end_element(name)
    return unless name == :Peptide
    @pbar.inc if !@pbar.nil?
    pep_id = @h[:id].to_sym
    @pep_h[pep_id] = @h[:PeptideSequence]
    return if @mh.empty?
    @mod_h[pep_id] = {}
    @mh.each do |mod|
      # both attributes are needed to form a location => mass-delta entry
      next if mod[:location].nil? || mod[:monoisotopicMassDelta].nil?
      @mod_h[pep_id][mod[:location].to_i] = mod[:monoisotopicMassDelta]
    end
  end
end
|
116
|
+
#
# SAX handler for PeptideEvidence elements. Builds a lookup from the
# evidence id (as a symbol) to a PeptideEvidence object
#
class PeptideEventHandler < Ox::Sax
  ATTR_MAP = [:post, :pre, :start, :end, :peptide_ref, :dBSequence_ref, :id, :isDecoy]
  ATTR = [:PeptideEvidence]

  attr_accessor :pep_ev_h, :pbar

  #
  # dbseq_h::   lookup from DBSequence id (symbol) to protein accession
  # num_pepev:: expected number of PeptideEvidence elements; when given, a
  #             progress bar of that size is created
  #
  def initialize(dbseq_h, num_pepev=nil)
    @dbseq_h = dbseq_h
    @pep_ev_h = {}
    @pbar = (ProgressBar.new("PepEv", num_pepev) unless num_pepev.nil?)
  end

  #
  # track the open element; a PeptideEvidence gets a fresh scratch hash
  #
  def start_element(name)
    @h = {} if name == :PeptideEvidence
    @curr_node = name
  end

  #
  # keep the attributes listed in ATTR_MAP while inside a PeptideEvidence
  #
  def attr(name, value)
    return unless ATTR.include?(@curr_node)
    return unless ATTR_MAP.include?(name)
    @h[name] = value
  end

  #
  # on a closing PeptideEvidence tag, advance the progress bar (if any) and
  # store a PeptideEvidence built from the collected attributes; the protein
  # id is resolved through the DBSequence lookup
  #
  def end_element(name)
    return unless name == :PeptideEvidence
    @pbar.inc unless @pbar.nil?
    ev_id = @h[:id].to_sym
    db_ref = @h[:dBSequence_ref].to_sym
    @pep_ev_h[ev_id] = PeptideEvidence.new(
      :db_seq_ref => db_ref,
      :pep_id => @h[:peptide_ref].to_sym,
      :start_pos => @h[:start],
      :end_pos => @h[:end],
      :prot_id => @dbseq_h[db_ref],
      :is_decoy => @h[:isDecoy]
    )
  end
end
|
153
|
+
#
# SAX handler for SpectrumIdentificationItem elements. For every item it
# collects a hash with the keys :id (Integer), :peptide_ref and
# :passThreshold (symbols), :peptideEvidence_ref (array of symbols) and
# :spec_prob (Float), and hands that hash to the supplied block when the
# item closes
#
class SpectraIDHandler < Ox::Sax
  ATTR = [:SpectrumIdentificationItem, :PeptideEvidenceRef]
  SPEC_ATTR_MAP = [:peptide_ref, :id, :passThreshold]
  SPEC_PROB_ATTR_MAP = [:accession, :value]
  SPEC_PROB_ACC = "MS:1002052" # code for spec-prob

  attr_accessor :spec_h, :pbar

  #
  # dbseq_h, pep_h, pep_ev_h:: lookups built by the other handlers
  # block::    callable invoked with the collected hash of each item
  # num_spec:: expected number of items; when given, a progress bar of that
  #            size is created
  #
  def initialize(dbseq_h, pep_h, pep_ev_h, block, num_spec=nil)
    @yield_to = block
    @dbseq_h = dbseq_h
    @pep_h = pep_h
    @pep_ev_h = pep_ev_h
    @spec_h = Hash.new
    @pbar = num_spec.nil? ? nil : ProgressBar.new("Spectra", num_spec)
  end

  #
  # track the open element. A new item starts with a clean scratch hash and
  # no pending spec-prob parameter, so a value captured for the previous
  # item can never be copied into this one
  #
  def start_element(name)
    if name == :SpectrumIdentificationItem
      @h = {}
      @h_param = nil
    end
    @curr_node = name
    @h_param = nil if name == :cvParam
  end

  #
  # collect attributes of the item, of its PeptideEvidenceRef children and
  # of the spec-prob cvParam
  #
  def attr(name, value)
    return unless ATTR.include?(@curr_node) ||
      (@curr_node == :cvParam && SPEC_PROB_ATTR_MAP.include?(name))

    # statement order matters: attributes are recorded only once the
    # spec-prob accession has been seen on this cvParam
    @h_param[name] = value if !@h_param.nil?
    @h_param = {} if name == :accession && value == SPEC_PROB_ACC
    if name == :peptideEvidence_ref then # if peptideEvidence, force into list
      (@h[name] ||= []).push(value.to_sym)
    end
    @h[name] = value.to_sym if SPEC_ATTR_MAP.include?(name)
    # the id is reduced to the integer after the first underscore
    @h[name] = value.split("_")[1].to_i if name == :id
  end

  #
  # once an element's attributes are complete, copy a pending spec-prob
  # value into the current item
  #
  def attrs_done()
    return if @h_param.nil? || @h.nil?
    @h[:spec_prob] = @h_param[:value].to_f
  end

  #
  # on a closing item tag, hand the collected hash to the block and advance
  # the progress bar (if any)
  #
  def end_element(name)
    return unless name == :SpectrumIdentificationItem
    @yield_to.call(@h)
    @pbar.inc if !@pbar.nil?
  end
end
|
202
|
+
|
203
|
+
#
# Read the DBSequence, Peptide and PeptideEvidence sections of the mzid file
# into lookup hashes, one SAX pass per section.
#
# file::     path to the mzid file
# use_pbar:: when truthy, an extra counting pass is made first and a progress
#            bar is shown for every section; nil or false disables both
# tda_flag:: when false, write_to_csv leaves out the decoy column
#
def initialize(file, use_pbar = nil, tda_flag = true)
  @use_pbar = use_pbar
  @mzid_file = file
  @tda_flag = tda_flag

  # count elements only when progress bars were requested; the counter stays
  # nil otherwise, so every later use is guarded by the same truthiness test
  # (use_pbar = false must not dereference a counter that was never built)
  count_handler = nil
  if @use_pbar then
    count_handler = CounterHandler.new
    File.open(@mzid_file) { |f| Ox.sax_parse(count_handler, f) }
    @num_spec = count_handler.spec_count
  end

  # cache DBSequence elements
  dbseq_handler = DBSequenceHandler.new(count_handler && count_handler.dbseq_count)
  File.open(@mzid_file) { |f| Ox.sax_parse(dbseq_handler, f) }
  dbseq_handler.pbar.finish if !dbseq_handler.pbar.nil?
  @dbseq_h = dbseq_handler.dbseq_h

  # cache Peptide elements
  pep_handler = PeptideHandler.new(count_handler && count_handler.pep_count)
  File.open(@mzid_file) { |f| Ox.sax_parse(pep_handler, f) }
  pep_handler.pbar.finish if !pep_handler.pbar.nil?
  @pep_h = pep_handler.pep_h
  @mod_h = pep_handler.mod_h

  # create/cache PeptideEvidence elements
  pep_ev_handler = PeptideEventHandler.new(@dbseq_h, count_handler && count_handler.pepev_count)
  File.open(@mzid_file) { |f| Ox.sax_parse(pep_ev_handler, f) }
  pep_ev_handler.pbar.finish if !pep_ev_handler.pbar.nil?
  @pep_ev_h = pep_ev_handler.pep_ev_h
end
|
240
|
+
#
# Stream every SpectrumIdentificationItem of the mzid file into a
# tab-separated file, one row per referenced PeptideEvidence entry.
#
# outfile::   path of the file to write
# show_mods:: when true, a trailing "mods" column is added
#
# The "decoy" column is left out when the parser was created with a false
# tda_flag.
#
def write_to_csv(outfile="result.csv", show_mods=true)
  # options are passed without braces so the call works both where CSV.open
  # takes a trailing hash and where it takes keyword arguments
  CSV.open(outfile, "w", :col_sep => "\t") do |csv|
    header = ["#spec_num", "peptide", "spec_prob", "decoy", "prot_ids", "start", "end", "num_prot"]
    header.push("mods") if show_mods
    header.delete("decoy") if !@tda_flag
    csv << header

    row_writer = Proc.new do |spec_h|
      # peptide reference/seq
      pep_ref = spec_h[:peptide_ref].to_sym
      pep_seq = @pep_h[pep_ref]
      modstr = show_mods ? format_mods(@mod_h[pep_ref]) : nil
      # peptide evidence list; an item without any reference yields no rows
      pep_ev_refs = Array(spec_h[:peptideEvidence_ref])
      # number of proteins with matching peptide
      num_prot = pep_ev_refs.size
      pep_ev_refs.each do |pep_ev_ref|
        pep_ev = @pep_ev_h[pep_ev_ref]
        row = [spec_h[:id], pep_seq, spec_h[:spec_prob]]
        row.push(pep_ev.get_is_decoy) if @tda_flag
        row.push(pep_ev.get_prot_id, pep_ev.get_start_pos, pep_ev.get_end_pos, num_prot)
        row.push(modstr) if show_mods
        csv << row
      end
    end

    spec_handler = SpectraIDHandler.new(@dbseq_h, @pep_h, @pep_ev_h, row_writer, @use_pbar ? @num_spec : nil)
    File.open(@mzid_file) { |f| Ox.sax_parse(spec_handler, f) }
    spec_handler.pbar.finish if !spec_handler.pbar.nil?
  end
end

#
# Render a { location => mass delta } hash as "loc;+N|loc;-N". The delta is
# truncated to an integer; negative values keep their single minus sign.
# Returns nil when there are no modifications.
#
def format_mods(mods)
  return nil if mods.nil?
  mods.map do |loc, delta|
    val = delta.to_i
    signed = val < 0 ? val.to_s : "+#{val}"
    "#{loc};#{signed}"
  end.join("|")
end
private :format_mods
|
289
|
+
|
290
|
+
end
|
291
|
+
|
292
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module MzID
|
2
|
+
#
# value holder for one peptide evidence entry of the file
#
class PeptideEvidence
  #
  # h:: hash of fields; every key is optional and a missing key leaves the
  #     field nil. Recognised keys: :id, :db_seq_ref, :pep_id, :start_pos,
  #     :end_pos, :pre, :post, :is_decoy, :prot_id
  #
  def initialize(h={})
    @id         = h.fetch(:id, nil)
    @db_seq_ref = h.fetch(:db_seq_ref, nil)
    @pep_id     = h.fetch(:pep_id, nil)
    @start_pos  = h.fetch(:start_pos, nil)
    @end_pos    = h.fetch(:end_pos, nil)
    @pre        = h.fetch(:pre, nil)
    @post       = h.fetch(:post, nil)
    @is_decoy   = h.fetch(:is_decoy, nil)
    @prot_id    = h.fetch(:prot_id, nil)
  end

  #
  # get methods: one zero-argument get_<field> reader per field
  #
  %w[id db_seq_ref pep_id start_pos end_pos pre post is_decoy prot_id].each do |field|
    define_method("get_#{field}") { instance_variable_get("@#{field}") }
  end

  #
  # represent as string
  #
  def to_s()
    format("[%s, %s; %s:%s, %s...%s]", @id, @pep_id, @start_pos, @end_pos, @pre, @post)
  end
end
|
37
|
+
|
38
|
+
|
39
|
+
end
|
data/lib/mzid/psm.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
|
2
|
+
module MzID
|
3
|
+
#
# class to represent a single peptide-spectrum match (PSM)
#
class PSM
  #
  # h:: hash of fields; every key is optional and a missing key leaves the
  #     field nil. Recognised keys: :id, :pep, :spec_prob, :pep_ref,
  #     :spec_ref, :spec_num, :mods, :pep_ev, :pass_threshold
  #
  def initialize(h={})
    @id = h[:id]
    @pep = h[:pep]
    @spec_prob = h[:spec_prob]
    @peptide_ref = h[:pep_ref]
    @spec_ref = h[:spec_ref]
    @spec_num = h[:spec_num]
    @mods = h[:mods]
    @pep_evidence = h[:pep_ev]
    @pass_thresh = h[:pass_threshold]
  end

  #
  #--
  # get methods
  #++
  #
  # get ID
  def get_id() @id end
  # get peptide sequence
  def get_pep() @pep end
  # get spectral probability
  def get_spec_prob() @spec_prob end
  # get peptide reference
  def get_pep_ref() @peptide_ref end
  # get spectrum reference
  def get_spec_ref() @spec_ref end
  # get spectrum number
  def get_spec_num() @spec_num end
  # get modifications
  def get_mods() @mods end
  # get peptide evidence
  def get_pep_ev() @pep_evidence end
  # get pass threshold flag
  def get_pass_threshold() @pass_thresh end

  #
  #--
  # set methods
  #++
  #
  # set the peptide sequence (defined once; returns the assigned value)
  def set_pep(pep) @pep = pep end
  # set the spectral probability
  def set_spec_prob(prob) @spec_prob = prob end
  # add the peptide evidence; the list is created on first use
  def add_pep_ev(pep_ev) @pep_evidence.nil? ? @pep_evidence = [pep_ev] : @pep_evidence.push(pep_ev) end

  #
  # output PSM as string
  #
  def to_s() "[#{@spec_num}; Pep: #{@pep}; SpecProb: #{@spec_prob}; Mods #{@mods.to_s}]" end
end
|
60
|
+
|
61
|
+
end
|
@@ -0,0 +1,177 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'progressbar'
|
3
|
+
require 'mzid/base_parser'
|
4
|
+
require 'mzid/batch_parser'
|
5
|
+
|
6
|
+
module MzID
|
7
|
+
#
# class to parse an mzIdentML file in a streaming (i.e., mem-efficient) manner
#
class StreamingParser < BatchParser

  #
  # file::     path to the mzid file
  # use_pbar:: when truthy, progress bars are shown while caching/iterating
  #
  def initialize(file, use_pbar = nil)
    @use_pbar = use_pbar
    @num_spec = 0
    super(file)
  end
  #
  # first pass thru file just counting element types. Returns
  # [num_pep, num_db_seq, num_pep_ev]; the SpectrumIdentificationResult count
  # is accumulated into @num_spec.
  #
  # NOTE(review): counting matches on the node name only, so a reader that
  # reports closing tags under the same name would count those too - confirm.
  #
  def get_num_elements(use_pbar=@use_pbar)
    num_pep = 0
    num_db_seq = 0
    num_pep_ev = 0
    # block form so the file handle is closed once counting is done
    File.open(@mzid_file) do |io|
      Nokogiri::XML::Reader(io).each do |node|
        case node.name
        when "SpectrumIdentificationResult" then @num_spec += 1
        when "Peptide" then num_pep += 1
        when "DBSequence" then num_db_seq += 1
        when "PeptideEvidence" then num_pep_ev += 1
        end
      end
    end
    [num_pep, num_db_seq, num_pep_ev]
  end
  #
  # store peptide sequences in hash for lookup, and cache the DBSequence and
  # PeptideEvidence entries along the way
  #
  def cache_ids(use_pbar = @use_pbar)
    num_pep, num_db_seq, num_pep_ev = get_num_elements(nil)

    @pep_h = Hash.new
    @mod_h = Hash.new
    pbar = ProgressBar.new("Caching", num_pep+num_db_seq+num_pep_ev) if use_pbar
    # block form so the file handle is closed once caching is done
    File.open(@mzid_file) do |io|
      Nokogiri::XML::Reader(io).each do |node|
        case node.name
        when "Peptide"
          # parse local peptide entry
          root = parse_local_node(node)
          pep_id = root["id"]
          # skip if already handled PepID
          next if @pep_h.has_key?(pep_id)
          # parse sequence/mods if haven't seen it yet
          @pep_h[pep_id] = get_peptide_sequence(root)
          @mod_h[pep_id] = get_modifications(root)
          pbar.inc if use_pbar
        when "DBSequence"
          # parse local DBSequence entry
          cache_db_seq_entries(parse_local_node(node))
          pbar.inc if use_pbar
        when "PeptideEvidence"
          # parse local PeptideEvidence entry
          cache_pep_ev(parse_local_node(node))
          pbar.inc if use_pbar
        end
      end
    end
    pbar.finish if use_pbar
  end
  #
  # store peptide evidence sequences in hash for lookup
  #
  def cache_pep_ev(root)
    root.xpath('//PeptideEvidence').each do |pnode|
      # only the fields needed downstream are kept on the object
      @pep_ev_h[pnode["id"]] =
        PeptideEvidence.new(:db_seq_ref => pnode["dBSequence_ref"],
                            :start_pos => pnode["start"].to_i,
                            :end_pos => pnode["end"].to_i,
                            :prot_id => @db_seq_h[pnode["dBSequence_ref"]].to_sym)
    end
  end
  #
  # iterate through each psm, yielding one PSM object per
  # SpectrumIdentificationItem
  #
  def each_psm(use_pbar=@use_pbar)
    pbar = ProgressBar.new("PSMs", @num_spec) if use_pbar
    # block form so the file handle is closed even when the caller breaks
    # out of the iteration early
    File.open(@mzid_file) do |io|
      Nokogiri::XML::Reader(io).each do |node|
        next if node.name != "SpectrumIdentificationResult"
        # parse local spec result entry, then each spectrum id item in it
        root = parse_local_node(node)
        root.xpath('.//SpectrumIdentificationItem').each do |psm_node|
          yield get_psm(psm_node)
        end
        pbar.inc if use_pbar
      end
    end
    pbar.finish if use_pbar
  end
  #
  # given a xml node of a psm, return the PSM
  #
  def get_psm(psm_node)
    # get peptide evidence list (PeptideEvidence objects from the cache)
    pep_ev_lst = psm_node.xpath('.//PeptideEvidenceRef').map do |penode|
      @pep_ev_h[penode["peptideEvidence_ref"]]
    end
    # find spectral prob among the cvParams
    spec_prob_node = psm_node.xpath('.//cvParam').find { |v| v['name'] == "MS-GF:SpecEValue" }
    spec_prob = spec_prob_node['value']
    # get peptide
    pep_ref = psm_node['peptide_ref']
    # get spectrum id/ref number: second and last "_"-separated parts of the id
    id_parts = psm_node['id'].split("_")
    # store in object
    PSM.new(:spec_num => id_parts[1].to_i,
            :spec_ref => id_parts[-1].to_i,
            :pep => @pep_h[pep_ref],
            :spec_prob => spec_prob.to_f,
            :mods => @mod_h[pep_ref],
            :pep_ev => pep_ev_lst)
  end
  #
  # re-parse the XML of a reader node as a small document of its own and
  # return its root element with namespaces removed
  #
  def parse_local_node(node)
    doc = Nokogiri::XML.parse(node.outer_xml)
    doc.remove_namespaces!
    doc.root
  end

  private :get_psm, :get_num_elements, :parse_local_node
end
|
176
|
+
|
177
|
+
end
|