mzid 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative 'load_helper'
4
+ require 'mzid'
5
+ require 'csv'
6
+ require 'progressbar'
7
+ require 'optparse'
8
+
9
+
10
#
# Command-line front end: parse the options, then convert one mzIdentML file
# to CSV through MzID::ParserSax.
#
# Defaults are set up front so every flag has a value even when it is not given.
options = { :verbose => false, :mods => false }
optparse = OptionParser.new do |opt|
  opt.banner = "Usage: results.mzid [OPTIONS]"
  opt.separator ""
  opt.separator "Options"

  opt.on("-v", "--verbose", "flag for verbose output or silent output") do |verbose|
    options[:verbose] = verbose
  end

  opt.on("-m", "--mods", "flag if the search contained modifications") do |ptm|
    options[:mods] = ptm
  end

  opt.on("-o","--output FILE","output file name, if unspecified will create a results.csv file") do |outFile|
    options[:output] = outFile
  end

  opt.on("-h","--help","help") do
    puts opt
    Process.exit(0)
  end
end
#
# basic checking
#
# An unknown or malformed option prints its message and the usage text instead
# of an uncaught-exception backtrace.
begin
  optparse.parse!
rescue OptionParser::ParseError => e
  warn e.message
  puts optparse
  Process.exit(1)
end
# Exactly one positional argument (the mzid file) must remain after parsing.
# This is a usage error, so the exit status is non-zero.
if ARGV.size != 1
  puts optparse
  Process.exit(1)
end
#
# setup params
#
result_mzid_file = ARGV[0]
tda_flag = true
# Default output name: swap a trailing ".mzid" extension for ".csv"; a name
# without that extension simply gets ".csv" appended.
outfile = options.fetch(:output) { result_mzid_file.sub(/\.mzid\z/, "") + ".csv" }
#
# parse file and output
#
# The second constructor argument is true when verbose, nil otherwise.
parser = MzID::ParserSax.new(result_mzid_file, (options[:verbose] ? true : nil), tda_flag)
parser.write_to_csv(outfile, options[:mods])
@@ -0,0 +1,2 @@
1
## if/when create a gem, comment this out
# Put the lib/ directory that sits next to this file's directory at the front
# of the load path. The path is expanded to a normalised absolute form so it
# does not depend on the current working directory.
$LOAD_PATH.unshift(File.expand_path("../../lib", __FILE__))
@@ -0,0 +1,14 @@
1
+
2
+ require 'mzid/psm'
3
+
4
+ require 'mzid/base_parser'
5
+ require 'mzid/batch_parser'
6
+ require 'mzid/streaming_parser'
7
+ require 'mzid/streaming_parser_lines'
8
+ require 'mzid/parser_sax'
9
+
10
# Namespace of the mzid library.
module MzID
  # Parser is bound to BatchParser, so MzID::Parser.new builds a BatchParser.
  Parser = BatchParser
end
@@ -0,0 +1,45 @@
1
+ require 'nokogiri'
2
+ require 'progressbar'
3
+
4
module MzID
  #
  # Base class for mzIdentML parsers: remembers the file to parse and offers
  # helpers for reading a parsed Peptide node.
  #
  class BaseParser

    #
    # file:: path of the mzIdentML file; kept in @mzid_file
    #
    def initialize(file)
      @mzid_file = file
    end

    #
    # Given the parsed node of a Peptide block, return the text content of its
    # first PeptideSequence descendant.
    #
    def get_peptide_sequence(pnode)
      pnode.xpath('.//PeptideSequence')[0].content
    end

    #
    # Given the parsed node of a Peptide block, collect its Modification
    # descendants into a Hash of (location - 1) => monoisotopicMassDelta (Float).
    # When two modifications share a location the later one wins.
    # Returns nil when the peptide carries no modifications.
    #
    def get_modifications(pep_node)
      mod_h = {}
      pep_node.xpath('.//Modification').each do |mod|
        # the 'location' attribute is shifted down by one before being used as key
        mod_h[mod['location'].to_i - 1] = mod['monoisotopicMassDelta'].to_f
      end
      mod_h.empty? ? nil : mod_h
    end

    private :get_peptide_sequence, :get_modifications

  end

end
@@ -0,0 +1,148 @@
1
+ require 'nokogiri'
2
+ require 'progressbar'
3
+ require 'mzid/base_parser'
4
+ require 'mzid/peptide_evidence'
5
+
6
module MzID
  #
  # Parser that reads an mzIdentML file with a full in-memory Nokogiri parse.
  # Construction caches the id lookups (DBSequence accessions, PeptideEvidence
  # objects, peptide sequences and modifications); #each_psm and #each_spectrum
  # then parse the file again and yield PSM objects built from those caches.
  #
  class BatchParser < BaseParser

    #
    # file:: path of the mzIdentML file. The file is parsed immediately to fill
    #        the lookup caches.
    #
    def initialize(file)
      super(file)
      @pep_ev_h = Hash.new
      @db_seq_h = Hash.new
      cache_ids
    end
    #
    # open the file, parse it strictly with blank nodes dropped, strip the
    # namespaces and yield the document root; returns the block's value
    #
    def with_document_root
      File.open(@mzid_file) do |io|
        doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS | Nokogiri::XML::ParseOptions::STRICT)
        doc.remove_namespaces!
        yield doc.root
      end
    end
    #
    # store peptide sequences in hash for lookup
    # (also fills the DBSequence and PeptideEvidence caches; the DBSequence
    # cache must come first because cache_pep_ev reads it)
    #
    def cache_ids()
      with_document_root do |root|
        cache_db_seq_entries(root)
        cache_pep_ev(root)

        @pep_h = Hash.new
        @mod_h = Hash.new
        root.xpath('//Peptide').each do |pnode|
          pep_id = pnode['id']
          pep_seq = get_peptide_sequence(pnode)
          mod_line = get_modifications(pnode)
          @pep_h[pep_id] = pep_seq
          @mod_h[pep_id] = mod_line
        end
      end
    end
    #
    # store peptide evidence sequences in hash for lookup
    # (@pep_ev_h maps PeptideEvidence id => PeptideEvidence object; prot_id is
    # the accession cached for the referenced DBSequence, as a Symbol)
    #
    def cache_pep_ev(root)
      root.xpath('//PeptideEvidence').each do |pnode|
        db_seq_ref = pnode["dBSequence_ref"]
        @pep_ev_h[pnode["id"]] =
          PeptideEvidence.new(:id => pnode["id"],
                              :db_seq_ref => db_seq_ref,
                              :pep_id => pnode["peptide_ref"],
                              :start_pos => pnode["start"].to_i,
                              :end_pos => pnode["end"].to_i,
                              :pre => pnode["pre"],
                              :post => pnode["post"],
                              :prot_id => @db_seq_h[db_seq_ref].to_sym)
      end
    end
    #
    # store database sequence entries (ids)
    # (@db_seq_h maps DBSequence id => accession)
    #
    def cache_db_seq_entries(root)
      root.xpath('//DBSequence').each do |dnode|
        @db_seq_h[dnode["id"]] = dnode["accession"]
      end
    end
    #
    # iterate through each psm
    #
    # use_pbar:: when truthy, a ProgressBar is advanced once per
    #            SpectrumIdentificationResult
    # Yields one PSM per SpectrumIdentificationItem.
    #
    def each_psm(use_pbar=nil)
      with_document_root do |root|
        # get list of identifications
        spec_results = root.xpath('//SpectrumIdentificationResult')
        pbar = ProgressBar.new("PSMs", spec_results.size) if use_pbar
        spec_results.each do |sres|
          # go over each PSM from the spectra
          sres.xpath('.//SpectrumIdentificationItem').each do |psm_node|
            # resolve the peptide evidence references through the cache
            pep_ev_lst = psm_node.xpath('.//PeptideEvidenceRef').map do |penode|
              @pep_ev_h[penode["peptideEvidence_ref"]]
            end
            # find spectral prob among the cvParams
            spec_prob = psm_node.xpath('.//cvParam').find{|v| v['name'] == "MS-GF:SpecEValue"}['value']
            pep_ref = psm_node['peptide_ref']
            # spectrum number is the 2nd "_"-separated field of the id,
            # spectrum ref the last one
            id_fields = psm_node['id'].split("_")
            # store in object
            psm = PSM.new(:spec_num => id_fields[1].to_i,
                          :spec_ref => id_fields[-1].to_i,
                          :pep => @pep_h[pep_ref],
                          :spec_prob => spec_prob.to_f,
                          :mods => @mod_h[pep_ref],
                          :pep_ev => pep_ev_lst
                          )
            # yield psm object
            yield psm
          end
          pbar.inc if use_pbar
        end
        pbar.finish if use_pbar
      end
    end
    #
    # for each spectrum, return a list of PSM objects for that spectrum
    #
    # Consecutive PSMs with the same spec num are grouped into one Array and
    # yielded together. Nothing is yielded when the file contains no PSMs.
    #
    def each_spectrum(use_pbar=nil)
      spec_lst = []
      each_psm(use_pbar) do |psm|
        # found new spec num, yield the finished psm list and start a new one
        if !spec_lst.empty? && spec_lst[-1].get_spec_num != psm.get_spec_num then
          yield spec_lst
          spec_lst = []
        end
        spec_lst.push(psm)
      end
      # flush the last group; skipped when there were no PSMs at all
      yield spec_lst unless spec_lst.empty?
    end


    private :cache_ids, :with_document_root

  end

end
@@ -0,0 +1,257 @@
1
+ require 'nokogiri'
2
+ require 'progressbar'
3
+ require 'mzid/base_parser'
4
+ require 'mzid/streaming_parser'
5
+
6
module MzID
  #
  # class to parse an mzIdentML file while keeping only small lookup tables in
  # memory; the tables are filled by several passes over the file (see
  # #cache_ids) and are keyed by Symbol:
  #   @pep_h              peptide id          => peptide sequence
  #   @mod_h              peptide id          => modifications (Hash or nil)
  #   @db_seq_h           DBSequence id       => accession
  #   @pep_ev_h_dbseqRef  PeptideEvidence id  => DBSequence id
  #
  # NOTE(review): no score filtering is implemented in this class; sp_thresh is
  # accepted by #initialize but never used.
  # NOTE(review): @db_seq_h is not created here; presumably StreamingParser
  # sets it up before cache_ids runs -- confirm.
  #
  class FilteredStreamingParser < StreamingParser

    attr_accessor :pep_ev_h_dbseqRef

    #
    # file::      path of the mzIdentML file
    # sp_thresh:: currently unused (kept for interface compatibility)
    # use_pbar::  forwarded to the superclass constructor
    #
    def initialize(file, sp_thresh = 10.0**-10, use_pbar = nil)
      @num_spec = 0
      #
      @pep_ev_h_protID = Hash.new
      @pep_ev_h_startPos = Hash.new
      @pep_ev_h_endPos = Hash.new
      @pep_ev_h_dbseqRef = Hash.new
      super(file, use_pbar)
    end
    #
    # placeholder; intentionally does nothing
    #
    def cache_ids2(use_pbar = @use_pbar)
    end

    #
    # look up the accession of the DBSequence referenced by the given
    # PeptideEvidence id (Symbol); nil when either lookup misses
    #
    def get_prot_id(pep_ev_id)
      @db_seq_h[@pep_ev_h_dbseqRef[pep_ev_id]]
    end

    #
    # store peptide sequences in hash for lookup
    # (also caches DBSequence accessions and PeptideEvidence references, and
    # prints the element counts and final table sizes to stdout)
    #
    def cache_ids(use_pbar = @use_pbar)
      num_pep, num_db_seq, num_pep_ev = get_num_elements(nil)
      puts "SPEC:\t#{@num_spec}"
      puts "PEP:\t#{num_pep}"
      puts "DB:\t#{num_db_seq}"
      puts "PEPEV:\t#{num_pep_ev}"

      #
      # pass 1: peptide sequences and modifications, via the pull reader
      #
      @pep_h = Hash.new
      @mod_h = Hash.new
      pbar1 = ProgressBar.new("peptides", num_pep/2) if use_pbar
      # block form so the file handle is closed once the pass is finished
      File.open(@mzid_file) do |io|
        Nokogiri::XML::Reader(io).each do |node|
          next unless node.name == "Peptide"
          # parse local peptide entry
          tmp_node = Nokogiri::XML.parse(node.outer_xml)
          tmp_node.remove_namespaces!
          root = tmp_node.root
          pep_id = root["id"].to_sym
          # skip if already handled PepID
          next if @pep_h.has_key?(pep_id)
          # parse sequence/mods if haven't seen it yet
          pep_seq = get_peptide_sequence(root)
          mod_line = get_modifications(root)
          @pep_h[pep_id] = pep_seq
          @mod_h[pep_id] = mod_line
          pbar1.inc if use_pbar
        end
      end
      pbar1.finish if use_pbar
      #
      # pass 2: DBSequence accessions, via a line scan. Only lines that start
      # with whitespace followed by "<DBSequence " are considered. A reader-based
      # pass calling cache_db_seq_entries would be an alternative, but is not
      # used here.
      #
      pbar2 = ProgressBar.new("db_seq", num_db_seq) if use_pbar
      IO.foreach(@mzid_file) do |line|
        next unless line =~ /^\s+<DBSequence\s/

        # take the whole attribute value, so accessions/ids containing
        # characters such as "." or "-" are not cut short
        prot_id = line.match(/accession=\"([^\"]+)\"/)[1]
        db_id = line.match(/\bid=\"([^\"]+)\"/)[1]

        @db_seq_h[db_id.to_sym] = prot_id.to_sym
        pbar2.inc if use_pbar
      end
      pbar2.finish if use_pbar
      #
      # pass 3: PeptideEvidence -> DBSequence references, via a line scan with
      # the same one-element-per-line requirement. A reader-based pass calling
      # cache_pep_ev would be an alternative, but is not used here.
      #
      pbar3 = ProgressBar.new("pep_ev", num_pep_ev) if use_pbar
      IO.foreach(@mzid_file) do |line|
        next unless line =~ /^\s+<PeptideEvidence\s/

        db_id = line.match(/dBSequence_ref=\"([^\"]+)\"/)[1]
        pep_ev = line.match(/\bid=\"([^\"]+)\"/)[1]
        @pep_ev_h_dbseqRef[pep_ev.to_sym] = db_id.to_sym
        pbar3.inc if use_pbar
      end
      pbar3.finish if use_pbar
      puts "PEP_H SIZE:\t#{@pep_h.size}"
      puts "DBSEQ_H SIZE:\t#{@db_seq_h.size}"
      puts "PEP_EV_H SIZE:\t#{@pep_ev_h_dbseqRef.size}"
    end
    #
    # store database sequence entries (ids)
    # (DOM-based variant: every DBSequence found from root is cached as
    # id => accession, both as Symbols)
    #
    def cache_db_seq_entries(root)
      root.xpath('//DBSequence').each do |dnode|
        @db_seq_h[dnode["id"].to_sym] = dnode["accession"].to_sym
      end
    end
    #
    # store peptide evidence sequences in hash for lookup
    # (DOM-based variant: only the PeptideEvidence id => dBSequence_ref mapping
    # is kept, both as Symbols)
    #
    def cache_pep_ev(root)
      root.xpath('//PeptideEvidence').each do |pnode|
        @pep_ev_h_dbseqRef[pnode["id"].to_sym] = pnode["dBSequence_ref"].to_sym
      end
    end
    #
    # iterate through each psm
    #
    # use_pbar:: when truthy, a ProgressBar is advanced once per
    #            SpectrumIdentificationResult
    # Yields one PSM (see #get_psm) per SpectrumIdentificationItem.
    #
    def each_psm(use_pbar=@use_pbar)
      File.open(@mzid_file) do |io|
        doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS | Nokogiri::XML::ParseOptions::STRICT)
        doc.remove_namespaces!
        # get list of identifications
        spec_results = doc.root.xpath('//SpectrumIdentificationResult')
        pbar = ProgressBar.new("PSMs", spec_results.size) if use_pbar
        spec_results.each do |sres|
          # go over each PSM from the spectra
          sres.xpath('.//SpectrumIdentificationItem').each do |psm_node|
            # yield psm object
            yield get_psm(psm_node)
          end
          pbar.inc if use_pbar
        end
        pbar.finish if use_pbar
      end
    end
    #
    # given a xml node of a psm, return the PSM
    #
    # :pep_ev holds the referenced PeptideEvidence ids as Symbols (resolve them
    # with #get_prot_id); :pep and :mods come from the Symbol-keyed caches.
    #
    def get_psm(psm_node)
      # get peptide evidence list
      pep_ev_lst = psm_node.xpath('.//PeptideEvidenceRef').map{|penode| penode["peptideEvidence_ref"].to_sym}
      # find spectral prob among the cvParams
      spec_prob = psm_node.xpath('.//cvParam').find{|v| v['name'] == "MS-GF:SpecEValue"}['value']
      # the caches are keyed by Symbol, so the reference must be converted
      # before BOTH lookups (a String key would never find the modifications)
      pep_ref = psm_node['peptide_ref'].to_sym
      # spectrum number is the 2nd "_"-separated field of the id,
      # spectrum ref the last one
      id_fields = psm_node['id'].split("_")
      #
      # store in object
      PSM.new(:spec_num => id_fields[1].to_i,
              :spec_ref => id_fields[-1].to_i,
              :pep => @pep_h[pep_ref],
              :spec_prob => spec_prob.to_f,
              :mods => @mod_h[pep_ref],
              :pep_ev => pep_ev_lst)
    end
    #
    # load PSMs into memory, and go back to perform lookup for prot ids
    #
    # NOTE(review): this method is unfinished -- it walks every PeptideEvidence
    # element and extracts its fields, but nothing is written to outfile yet.
    #
    def write_to_file(outfile, use_pbar=@use_pbar)

      # progress total: number of PeptideEvidence ids cached so far
      pbar3 = ProgressBar.new("Caching pep_ev", @pep_ev_h_dbseqRef.size) if use_pbar
      # block form so the file handle is closed once the pass is finished
      File.open(@mzid_file) do |io|
        Nokogiri::XML::Reader(io).each do |node|
          next unless node.name == "PeptideEvidence"
          # parse local PeptideEvidence entry
          tmp_node = Nokogiri::XML.parse(node.outer_xml)
          tmp_node.remove_namespaces!
          tmp_node.root.xpath('//PeptideEvidence').each do |pnode|
            id = pnode["id"]
            start_pos = pnode["start"].to_i
            end_pos = pnode["end"].to_i
            db_seq_ref = pnode["dBSequence_ref"].to_sym
          end
          pbar3.inc if use_pbar
        end
      end
      pbar3.finish if use_pbar

    end


  end

end