mzid 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative 'load_helper'
4
+ require 'mzid'
5
+ require 'csv'
6
+ require 'progressbar'
7
+ require 'optparse'
8
+
9
+
10
#
# command-line entry point: parse an mzIdentML results file and write it out as CSV
#
# defaults are set up front so the hash always contains :verbose and :mods
options = { :verbose => false, :mods => false }
optparse = OptionParser.new do |opt|
  opt.banner = "Usage: results.mzid [OPTIONS]"
  opt.separator ""
  opt.separator "Options"

  opt.on("-v", "--verbose", "flag for verbose output or silent output") do |verbose|
    options[:verbose] = verbose
  end

  opt.on("-m", "--mods", "flag if the search contained modifications") do |ptm|
    options[:mods] = ptm
  end

  opt.on("-o","--output FILE","output file name, if unspecified will create a results.csv file") do |outFile|
    options[:output] = outFile
  end

  opt.on("-h","--help","help") do
    puts opt
    Process.exit(0)
  end
end
#
# parse the command line; an unknown or malformed option prints the usage
# text instead of an uncaught-exception backtrace
#
begin
  optparse.parse!
rescue OptionParser::ParseError => e
  warn e.message
  warn optparse.to_s
  Process.exit(1)
end
#
# basic checking: exactly one positional argument (the .mzid file) is required
#
if ARGV.size != 1 then
  puts optparse
  Process.exit(0)
end
#
# setup params
#
result_mzid_file = ARGV[0]
tda_flag = true
# default output name: strip a trailing ".mzid" extension (only at the very
# end of the path, so directory names are left untouched) and append ".csv"
outfile = options.fetch(:output) { result_mzid_file.sub(/\.mzid\z/, "") + ".csv" }
#
# parse file and output
#
# the second argument is true when verbose was requested and nil otherwise
parser = MzID::ParserSax.new(result_mzid_file, (options[:verbose] ? true : nil), tda_flag)
parser.write_to_csv(outfile, options[:mods])
@@ -0,0 +1,2 @@
1
## if/when create a gem, comment this out
# Put the sibling lib/ directory at the front of the load path. The path is
# expanded to an absolute one so that it keeps resolving even if the process
# later changes its working directory.
lib_dir = File.expand_path("../lib", File.dirname(__FILE__))
$LOAD_PATH.unshift(lib_dir)
@@ -0,0 +1,14 @@
1
+
2
+ require 'mzid/psm'
3
+
4
+ require 'mzid/base_parser'
5
+ require 'mzid/batch_parser'
6
+ require 'mzid/streaming_parser'
7
+ require 'mzid/streaming_parser_lines'
8
+ require 'mzid/parser_sax'
9
+
10
#
# top-level namespace of the library
#
module MzID
  # MzID::Parser is an alias for the batch parser
  Parser = BatchParser
end
@@ -0,0 +1,45 @@
1
+ require 'nokogiri'
2
+ require 'progressbar'
3
+
4
module MzID
  #
  # base class for mzIdentML parsers: stores the input file path and provides
  # the peptide-level extraction helpers used by subclasses
  #
  class BaseParser

    #
    # file:: path to the mzIdentML file (stored as @mzid_file)
    #
    def initialize(file)
      @mzid_file = file
    end
    #
    # given an XML.parse output from the peptide block, extract peptide sequence
    # (the text content of the first PeptideSequence descendant)
    #
    def get_peptide_sequence(pnode)
      pnode.xpath('.//PeptideSequence')[0].content
    end
    #
    # given an XML.parse output from the peptide block, extract modifications
    #
    # returns a hash mapping (location attribute - 1) => monoisotopicMassDelta
    # as a float, or nil when the peptide has no Modification elements; if two
    # modifications share a location the later one wins
    #
    def get_modifications(pep_node)
      mod_h = {}
      pep_node.xpath('.//Modification').each do |mod|
        mod_h[mod['location'].to_i - 1] = mod['monoisotopicMassDelta'].to_f
      end
      mod_h.empty? ? nil : mod_h
    end

    private :get_peptide_sequence, :get_modifications

  end

end
@@ -0,0 +1,148 @@
1
+ require 'nokogiri'
2
+ require 'progressbar'
3
+ require 'mzid/base_parser'
4
+ require 'mzid/peptide_evidence'
5
+
6
module MzID
  #
  # class to parse an mzIdentML file
  #
  # the whole document is parsed with Nokogiri; peptide, peptide-evidence and
  # database-sequence ids are cached at construction time and the document is
  # parsed again on every call to each_psm
  #
  class BatchParser < BaseParser

    #
    # file:: path to the mzIdentML file; the id caches are filled immediately
    #
    def initialize(file)
      super(file)
      @pep_ev_h = {}
      @db_seq_h = {}
      cache_ids
    end
    #
    # store peptide sequences in hash for lookup
    # (@pep_h: peptide id => sequence, @mod_h: peptide id => modifications or nil)
    #
    def cache_ids()
      root = parse_document

      # order matters: cache_pep_ev resolves protein ids through @db_seq_h
      cache_db_seq_entries(root)
      cache_pep_ev(root)

      @pep_h = {}
      @mod_h = {}
      root.xpath('//Peptide').each do |pnode|
        pep_id = pnode['id']
        @pep_h[pep_id] = get_peptide_sequence(pnode)
        @mod_h[pep_id] = get_modifications(pnode)
      end
    end
    #
    # store peptide evidence sequences in hash for lookup
    # (@pep_ev_h: peptide evidence id => PeptideEvidence)
    #
    def cache_pep_ev(root)
      root.xpath('//PeptideEvidence').each do |pnode|
        ev_id = pnode["id"]
        db_seq_ref = pnode["dBSequence_ref"]

        # NOTE: raises NoMethodError when db_seq_ref was not cached by
        # cache_db_seq_entries (nil has no to_sym)
        @pep_ev_h[ev_id] =
          PeptideEvidence.new(:id => ev_id,
                              :db_seq_ref => db_seq_ref,
                              :pep_id => pnode["peptide_ref"],
                              :start_pos => pnode["start"].to_i,
                              :end_pos => pnode["end"].to_i,
                              :pre => pnode["pre"],
                              :post => pnode["post"],
                              :prot_id => @db_seq_h[db_seq_ref].to_sym)
      end
    end
    #
    # store database sequence entries (ids)
    # (@db_seq_h: DBSequence id => accession)
    #
    def cache_db_seq_entries(root)
      root.xpath('//DBSequence').each do |dnode|
        @db_seq_h[dnode["id"]] = dnode["accession"]
      end
    end
    #
    # iterate through each psm
    #
    # use_pbar:: when truthy, show a progress bar that advances once per
    #            SpectrumIdentificationResult
    # yields one PSM per SpectrumIdentificationItem; returns an Enumerator
    # when called without a block
    #
    def each_psm(use_pbar=nil)
      return enum_for(:each_psm, use_pbar) unless block_given?

      # get list of identifications
      spec_results = parse_document.xpath('//SpectrumIdentificationResult')
      pbar = ProgressBar.new("PSMs", spec_results.size) if use_pbar
      spec_results.each do |sres|
        # go over each PSM from the spectra
        sres.xpath('.//SpectrumIdentificationItem').each do |psm_node|
          yield get_psm(psm_node)
        end
        pbar.inc if use_pbar
      end
      pbar.finish if use_pbar
    end
    #
    # for each spectrum, return a list of PSM objects for that spectrum
    #
    # consecutive PSMs sharing the same spec_num are grouped into one list;
    # nothing is yielded when the file contains no PSMs; returns an Enumerator
    # when called without a block
    #
    def each_spectrum(use_pbar=nil)
      return enum_for(:each_spectrum, use_pbar) unless block_given?

      spec_lst = []
      each_psm(use_pbar) do |psm|
        # found new spec num, yield the finished psm list and start a fresh one
        if !spec_lst.empty? && spec_lst[-1].get_spec_num != psm.get_spec_num then
          yield spec_lst
          spec_lst = []
        end
        spec_lst.push(psm)
      end
      # flush the last group; skipped when there were no PSMs at all
      yield spec_lst unless spec_lst.empty?
    end
    #
    # given a xml node of a psm, return the PSM
    #
    def get_psm(psm_node)
      # get peptide evidence list
      pep_ev_lst = psm_node.xpath('.//PeptideEvidenceRef').map do |penode|
        @pep_ev_h[penode["peptideEvidence_ref"]]
      end
      # find spectral prob
      # NOTE: a cvParam named "MS-GF:SpecEValue" must be present, otherwise
      # this raises NoMethodError on nil
      spec_prob = psm_node.xpath('.//cvParam').find{|v| v['name'] == "MS-GF:SpecEValue"}['value']
      pep_ref = psm_node['peptide_ref']
      # spectrum id/ref number are the 2nd and last "_"-separated fields of the id
      id_fields = psm_node['id'].split("_")
      # store in object
      PSM.new(:spec_num => id_fields[1].to_i,
              :spec_ref => id_fields[-1].to_i,
              :pep => @pep_h[pep_ref],
              :spec_prob => spec_prob.to_f,
              :mods => @mod_h[pep_ref],
              :pep_ev => pep_ev_lst)
    end
    #
    # parse the whole mzIdentML file and return its namespace-stripped root node
    #
    def parse_document
      File.open(@mzid_file) do |io|
        parse_opts = Nokogiri::XML::ParseOptions::DEFAULT_XML |
                     Nokogiri::XML::ParseOptions::NOBLANKS |
                     Nokogiri::XML::ParseOptions::STRICT
        doc = Nokogiri::XML.parse(io, nil, nil, parse_opts)
        doc.remove_namespaces!
        doc.root
      end
    end

    private :cache_ids, :get_psm, :parse_document

  end

end
@@ -0,0 +1,257 @@
1
+ require 'nokogiri'
2
+ require 'progressbar'
3
+ require 'mzid/base_parser'
4
+ require 'mzid/streaming_parser'
5
+
6
module MzID
  #
  # class to parse an mzIdentML file in a streaming (i.e., mem-efficient) manner
  #
  # the id caches are built in several passes over the file (see cache_ids):
  # 1) collect counts of elements
  # 2) cache peptide sequences/modifications through the XML reader
  # 3) cache DBSequence and PeptideEvidence ids with a line-by-line scan
  #
  class FilteredStreamingParser < StreamingParser

    attr_accessor :pep_ev_h_dbseqRef

    #
    # file::      path to the mzIdentML file
    # sp_thresh:: accepted for interface compatibility; not used by this class
    # use_pbar::  passed on to the superclass
    #
    def initialize(file, sp_thresh = 10.0**-10, use_pbar = nil)
      @num_spec = 0
      #
      @pep_ev_h_protID = {}
      @pep_ev_h_startPos = {}
      @pep_ev_h_endPos = {}
      @pep_ev_h_dbseqRef = {}
      super(file, use_pbar)
    end
    #
    # currently a no-op
    #
    def cache_ids2(use_pbar = @use_pbar)
    end
    #
    # look up the protein id (accession symbol) for a peptide evidence id
    #
    def get_prot_id(pep_ev_id)
      @db_seq_h[@pep_ev_h_dbseqRef[pep_ev_id]]
    end
    #
    # store peptide sequences in hash for lookup
    #
    # fills @pep_h / @mod_h (peptide id symbol => sequence / modifications),
    # @db_seq_h (DBSequence id symbol => accession symbol) and
    # @pep_ev_h_dbseqRef (PeptideEvidence id symbol => DBSequence id symbol);
    # element counts and final cache sizes are printed to stdout
    #
    def cache_ids(use_pbar = @use_pbar)
      num_pep, num_db_seq, num_pep_ev = get_num_elements(nil)
      puts "SPEC:\t#{@num_spec}"
      puts "PEP:\t#{num_pep}"
      puts "DB:\t#{num_db_seq}"
      puts "PEPEV:\t#{num_pep_ev}"

      @pep_h = {}
      @mod_h = {}
      pbar1 = ProgressBar.new("peptides", num_pep/2) if use_pbar
      # block form so the file handle is closed once the reader is done
      File.open(@mzid_file) do |io|
        Nokogiri::XML::Reader(io).each do |node|
          next unless node.name == "Peptide"
          # parse local peptide entry
          tmp_node = Nokogiri::XML.parse(node.outer_xml)
          tmp_node.remove_namespaces!
          root = tmp_node.root
          pep_id = root["id"].to_sym
          # skip if already handled PepID
          next if @pep_h.has_key?(pep_id)
          # parse sequence/mods if haven't seen it yet
          @pep_h[pep_id] = get_peptide_sequence(root)
          @mod_h[pep_id] = get_modifications(root)
          pbar1.inc if use_pbar
        end
      end
      pbar1.finish if use_pbar
      #
      # DBSequence and PeptideEvidence entries are picked up with a plain
      # line scan instead of the XML reader; this only matches elements that
      # start on their own indented line with the attributes on that line
      #
      pbar2 = ProgressBar.new("db_seq", num_db_seq) if use_pbar
      IO.foreach(@mzid_file) do |line|
        next unless line.match(/^\s+<DBSequence\s/)

        prot_id = line.match(/accession=\"([\w|\|]+)/)[1]
        db_id = line.match(/id=\"(\w+)/)[1]

        @db_seq_h[db_id.to_sym] = prot_id.to_sym
        pbar2.inc if use_pbar
      end
      pbar2.finish if use_pbar
      #
      pbar3 = ProgressBar.new("pep_ev", num_pep_ev) if use_pbar
      IO.foreach(@mzid_file) do |line|
        next unless line.match(/^\s+<PeptideEvidence\s/)

        db_id = line.match(/dBSequence_ref=\"(\w+)/)[1]
        pep_ev = line.match(/id=\"(\w+)/)[1]
        @pep_ev_h_dbseqRef[pep_ev.to_sym] = db_id.to_sym
        pbar3.inc if use_pbar
      end
      pbar3.finish if use_pbar
      puts "PEP_H SIZE:\t#{@pep_h.size}"
      puts "DBSEQ_H SIZE:\t#{@db_seq_h.size}"
      puts "PEP_EV_H SIZE:\t#{@pep_ev_h_dbseqRef.size}"
    end
    #
    # store database sequence entries (ids)
    #
    def cache_db_seq_entries(root)
      root.xpath('//DBSequence').each do |dnode|
        @db_seq_h[dnode["id"].to_sym] = dnode["accession"].to_sym
      end
    end
    #
    # store peptide evidence sequences in hash for lookup
    # (only the PeptideEvidence id => dBSequence_ref mapping is kept)
    #
    def cache_pep_ev(root)
      root.xpath('//PeptideEvidence').each do |pnode|
        @pep_ev_h_dbseqRef[pnode["id"].to_sym] = pnode["dBSequence_ref"].to_sym
      end
    end
    #
    # iterate through each psm
    #
    # NOTE: this parses the whole document with Nokogiri::XML.parse rather
    # than the streaming reader; returns an Enumerator when called without
    # a block
    #
    def each_psm(use_pbar=@use_pbar)
      return enum_for(:each_psm, use_pbar) unless block_given?

      root = File.open(@mzid_file) do |io|
        parse_opts = Nokogiri::XML::ParseOptions::DEFAULT_XML |
                     Nokogiri::XML::ParseOptions::NOBLANKS |
                     Nokogiri::XML::ParseOptions::STRICT
        doc = Nokogiri::XML.parse(io, nil, nil, parse_opts)
        doc.remove_namespaces!
        doc.root
      end
      # get list of identifications
      spec_results = root.xpath('//SpectrumIdentificationResult')
      pbar = ProgressBar.new("PSMs", spec_results.size) if use_pbar
      spec_results.each do |sres|
        # go over each PSM from the spectra
        sres.xpath('.//SpectrumIdentificationItem').each do |psm_node|
          yield get_psm(psm_node)
        end
        pbar.inc if use_pbar
      end
      pbar.finish if use_pbar
    end
    #
    # given a xml node of a psm, return the PSM
    # (its :pep_ev entry is the list of peptide evidence ids as symbols)
    #
    def get_psm(psm_node)
      # get peptide evidence list
      pep_ev_lst = psm_node.xpath('.//PeptideEvidenceRef').map{|penode| penode["peptideEvidence_ref"].to_sym}
      # find spectral prob
      # NOTE: a cvParam named "MS-GF:SpecEValue" must be present, otherwise
      # this raises NoMethodError on nil
      spec_prob = psm_node.xpath('.//cvParam').find{|v| v['name'] == "MS-GF:SpecEValue"}['value']
      # the peptide caches are keyed by symbol (see cache_ids), so both the
      # sequence and the modification lookups must use the symbol form
      pep_ref = psm_node['peptide_ref'].to_sym
      # spectrum id/ref number are the 2nd and last "_"-separated fields of the id
      id_fields = psm_node['id'].split("_")
      #
      # store in object
      PSM.new(:spec_num => id_fields[1].to_i,
              :spec_ref => id_fields[-1].to_i,
              :pep => @pep_h[pep_ref],
              :spec_prob => spec_prob.to_f,
              :mods => @mod_h[pep_ref],
              :pep_ev => pep_ev_lst)
    end
    #
    # load PSMs into memory, and go back to perform lookup for prot ids
    #
    # TODO: unfinished - this walks every PeptideEvidence element and extracts
    # its fields, but nothing is written to outfile yet
    #
    def write_to_file(outfile, use_pbar=@use_pbar)
      # progress total: number of peptide evidence ids cached by cache_ids
      pbar3 = ProgressBar.new("Caching pep_ev", @pep_ev_h_dbseqRef.size) if use_pbar
      # block form so the file handle is closed once the reader is done
      File.open(@mzid_file) do |io|
        Nokogiri::XML::Reader(io).each do |node|
          next unless node.name == "PeptideEvidence"
          # parse local PeptideEvidence entry
          tmp_node = Nokogiri::XML.parse(node.outer_xml)
          tmp_node.remove_namespaces!
          tmp_node.root.xpath('//PeptideEvidence').each do |pnode|
            id = pnode["id"]
            start_pos = pnode["start"].to_i
            end_pos = pnode["end"].to_i
            db_seq_ref = pnode["dBSequence_ref"].to_sym
          end
          pbar3.inc if use_pbar
        end
      end
      pbar3.finish if use_pbar
    end

  end

end