mspire 0.3.9 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +24 -7
- data/README +15 -13
- data/README.rdoc +18 -0
- data/Rakefile +50 -14
- data/bin/aafreqs.rb +0 -0
- data/bin/bioworks2excel.rb +0 -0
- data/bin/bioworks_to_pepxml.rb +2 -1
- data/bin/bioworks_to_pepxml_gui.rb +0 -0
- data/bin/fasta_shaker.rb +0 -0
- data/bin/filter_and_validate.rb +0 -0
- data/bin/gi2annot.rb +0 -0
- data/bin/id_class_anal.rb +0 -0
- data/bin/id_precision.rb +0 -0
- data/bin/ms_to_lmat.rb +0 -0
- data/bin/pepproph_filter.rb +0 -0
- data/bin/protein_summary.rb +0 -0
- data/bin/protxml2prots_peps.rb +0 -0
- data/bin/raw_to_mzXML.rb +3 -3
- data/bin/run_percolator.rb +122 -0
- data/bin/sqt_group.rb +0 -0
- data/bin/srf_group.rb +0 -0
- data/changelog.txt +29 -0
- data/lib/ms/gradient_program.rb +0 -1
- data/lib/ms/msrun.rb +62 -29
- data/lib/ms/parser/mzdata/axml.rb +55 -0
- data/lib/ms/parser/mzdata/dom.rb +51 -36
- data/lib/ms/parser/mzdata.rb +8 -2
- data/lib/ms/parser/mzxml/axml.rb +59 -0
- data/lib/ms/parser/mzxml/dom.rb +80 -57
- data/lib/ms/parser/mzxml/hpricot.rb +1 -1
- data/lib/ms/parser/mzxml/libxml.rb +6 -2
- data/lib/ms/parser/mzxml.rb +110 -3
- data/lib/ms/parser.rb +4 -4
- data/lib/ms/precursor.rb +19 -4
- data/lib/ms/scan.rb +7 -7
- data/lib/ms/spectrum.rb +249 -58
- data/lib/mspire.rb +1 -1
- data/lib/spec_id/bioworks.rb +2 -2
- data/lib/spec_id/precision/filter/cmdline.rb +8 -1
- data/lib/spec_id/precision/prob/cmdline.rb +2 -2
- data/lib/spec_id/precision/prob.rb +1 -0
- data/lib/spec_id/proph/pep_summary.rb +3 -4
- data/lib/spec_id/proph/prot_summary.rb +3 -3
- data/lib/spec_id/protein_summary.rb +1 -1
- data/lib/spec_id/sequest/pepxml.rb +5 -5
- data/lib/spec_id/sqt.rb +4 -4
- data/lib/spec_id/srf.rb +49 -8
- data/lib/spec_id.rb +5 -0
- data/lib/xml_style_parser.rb +16 -2
- data/script/compile_and_plot_smriti_final.rb +0 -0
- data/script/create_little_pepxml.rb +0 -0
- data/script/degenerate_peptides.rb +0 -0
- data/script/estimate_fpr_by_cysteine.rb +0 -0
- data/script/extract_gradient_programs.rb +1 -1
- data/script/find_cysteine_background.rb +0 -0
- data/script/genuine_tps_and_probs.rb +0 -0
- data/script/get_apex_values_rexml.rb +0 -0
- data/script/mascot_fix_pepxml.rb +123 -0
- data/script/msvis.rb +0 -0
- data/script/mzXML2timeIndex.rb +0 -0
- data/script/peps_per_bin.rb +0 -0
- data/script/prep_dir.rb +0 -0
- data/script/simple_protein_digestion.rb +0 -0
- data/script/smriti_final_analysis.rb +0 -0
- data/script/sqt_to_meta.rb +0 -0
- data/script/top_hit_per_scan.rb +0 -0
- data/script/toppred_to_yaml.rb +0 -0
- data/script/tpp_installer.rb +0 -0
- data/specs/bin/prob_validate_spec.rb +5 -2
- data/specs/bin/protein_summary_spec.rb +5 -1
- data/specs/ms/msrun_spec.rb +176 -133
- data/specs/ms/parser_spec.rb +3 -3
- data/specs/ms/spectrum_spec.rb +0 -2
- data/specs/spec_id/precision/filter_spec.rb +4 -1
- data/specs/spec_id/precision/prob_spec.rb +2 -2
- data/specs/spec_id/sequest/pepxml_spec.rb +1 -1
- data/specs/spec_id/sqt_spec.rb +5 -5
- data/specs/spec_id/srf_spec.rb +56 -93
- data/specs/spec_id/srf_spec_helper.rb +121 -284
- data/specs/spec_id_spec.rb +3 -0
- data/specs/transmem/toppred_spec.rb +1 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +683 -0
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +382 -0
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +683 -0
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +382 -0
- data/test_files/opd1_2runs_2mods/data/README.txt +6 -0
- metadata +247 -229
|
@@ -4,9 +4,64 @@ class MS::Parser::MzData::AXML < MS::Parser::MzData::DOM
|
|
|
4
4
|
# Parses the XML document stored at +file+ with ::AXML.parse_file and
# returns that call's result unchanged.
def get_root_node_from_file(file)
  parsed_root = ::AXML.parse_file(file)
  parsed_root
end
|
|
7
|
+
# Parses XML from the already-open +io+ with ::AXML.parse and returns that
# call's result unchanged.
def get_root_node_from_io(io)
  parsed_root = ::AXML.parse(io)
  parsed_root
end
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
# MzData parser flavour whose three root-node getters all delegate to
# ::AXML::LazyData instead of plain ::AXML.
class MS::Parser::MzData::AXML::LazyData < MS::Parser::MzData::AXML
  # root node from an XML document held in a String
  def get_root_node_from_string(string)
    lazy_backend.parse(string)
  end

  # root node from the XML document at path +file+
  def get_root_node_from_file(file)
    lazy_backend.parse_file(file)
  end

  # root node from XML read off an open +io+
  def get_root_node_from_io(io)
    lazy_backend.parse(io)
  end

  private

  # the tree builder every getter above goes through
  def lazy_backend
    ::AXML::LazyData
  end
end
|
|
8
23
|
|
|
24
|
+
class AXML::LazyData < AXML
  class << self
    # Feeds +stream+ to a fresh ::AXML::XMLParser::LazyData and returns the
    # parser's root (the root node as Element, or nodes as Array).
    def parse(stream)
      lazy_parser = ::AXML::XMLParser::LazyData.new
      lazy_parser.parse(stream)
      lazy_parser.root
    end
  end
end
|
|
9
32
|
|
|
33
|
+
# Parser that remembers *where* the data (peaks) payload sits in the input
# instead of keeping the payload text.
# The text of a 'data' node is an Array: its first member is the start
# index (byteIndex when character data is first reported) and its last
# member is the number of bytes.  Any members in between should be ignored.
class AXML::XMLParser::LazyData < ::AXML::XMLParser

  # Creates the element for an opening tag, hangs it under the current node
  # and makes it the current node.
  def startElement(name, attributes)
    # 'data' nodes collect byte positions; every other node collects text
    text_holder = (name == 'data') ? [] : ''
    child = ::AXML::El.new(@cur, name, attributes, text_holder, [])
    @cur.add_node(child)
    @cur = child
  end

  # Records the current byteIndex for position-collecting nodes, or appends
  # the character data itself for ordinary nodes.
  def character(data)
    holder = @cur.text
    holder << (holder.is_a?(Array) ? byteIndex : data)
  end

  # Closes the current node; for position-collecting nodes the distance from
  # the first recorded index to the current byteIndex is appended first.
  def endElement(name)
    positions = @cur.text
    positions << (byteIndex - positions.first) if positions.is_a?(Array)
    @cur = @cur.parent
  end

end
|
data/lib/ms/parser/mzdata/dom.rb
CHANGED
|
@@ -28,11 +28,7 @@ class MS::Parser::MzData::DOM
|
|
|
28
28
|
|
|
29
29
|
# OPTIONS:
|
|
30
30
|
# :msrun => MSRun # use this object instead of creating one
|
|
31
|
-
# :spectra => *true|false # if false don't get spectra
|
|
32
31
|
def msrun(file, opts={})
|
|
33
|
-
unless opts.key?(:spectra)
|
|
34
|
-
opts[:spectra] = true
|
|
35
|
-
end
|
|
36
32
|
msrun_obj =
|
|
37
33
|
if x = opts[:msrun]
|
|
38
34
|
msrun_obj = x
|
|
@@ -48,9 +44,18 @@ class MS::Parser::MzData::DOM
|
|
|
48
44
|
id_to_scan_hash = {}
|
|
49
45
|
|
|
50
46
|
# 0 1 2 3 4 5 6
|
|
51
|
-
# %w(num msLevel retentionTime startMz endMz
|
|
47
|
+
# %w(num msLevel retentionTime startMz endMz precursor spectrum)
|
|
48
|
+
|
|
49
|
+
io =
|
|
50
|
+
if file.is_a? String
|
|
51
|
+
filename = file
|
|
52
|
+
File.open(file)
|
|
53
|
+
else
|
|
54
|
+
file
|
|
55
|
+
end
|
|
56
|
+
root = get_root_node_from_io(io)
|
|
57
|
+
|
|
52
58
|
|
|
53
|
-
root = get_root_node_from_file(file)
|
|
54
59
|
description = root.find_first('child::description')
|
|
55
60
|
bioworks33 = is_bioworks33?(description)
|
|
56
61
|
spectrum_list = description.next
|
|
@@ -91,49 +96,57 @@ class MS::Parser::MzData::DOM
|
|
|
91
96
|
end
|
|
92
97
|
if scan[1] > 1 # precursormz info
|
|
93
98
|
prec_list_n = spec_settings_n.next
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
when 'ChargeState'
|
|
112
|
-
charges << cv_param_n['value'].to_i
|
|
99
|
+
raise RuntimeError, "MSRun objects can only accept 1 precursor" if prec_list_n['count'] != '1'
|
|
100
|
+
prec_n = prec_list_n.find_first('child::precursor')
|
|
101
|
+
# %w(mz inten parent ms_level parent charge_states)
|
|
102
|
+
prec = MS::Precursor.new
|
|
103
|
+
unless bioworks33 # bioworks33 points to the wrong scan!!!
|
|
104
|
+
prec[2] = id_to_scan_hash[prec_n['spectrumRef'].to_i]
|
|
105
|
+
end
|
|
106
|
+
# we're not keeping track of this guy anymore
|
|
107
|
+
# prec[3] = prec_n['msLevel'].to_i
|
|
108
|
+
charges = []
|
|
109
|
+
prec_n.find('descendant::cvParam').each do |cv_param_n|
|
|
110
|
+
case cv_param_n['name']
|
|
111
|
+
when 'MassToChargeRatio'
|
|
112
|
+
prec[0] = cv_param_n['value'].to_f
|
|
113
|
+
# find the prec intensity
|
|
114
|
+
unless bioworks33
|
|
115
|
+
prec[1] = prec[2].spectrum.intensity_at_mz(prec[0])
|
|
113
116
|
end
|
|
117
|
+
when 'ChargeState'
|
|
118
|
+
charges << cv_param_n['value'].to_i
|
|
114
119
|
end
|
|
115
|
-
prec[5] = charges
|
|
116
|
-
prec
|
|
117
120
|
end
|
|
118
|
-
|
|
121
|
+
prec[3] = charges
|
|
122
|
+
scan[5] = prec
|
|
119
123
|
else # no precursors
|
|
120
|
-
scan[5] =
|
|
124
|
+
scan[5] = nil
|
|
121
125
|
end
|
|
122
126
|
# here's the one line way of doing it, but it's probably more clear in
|
|
123
127
|
# the loop
|
|
124
128
|
#while ((mz_array_bin_n = spec_desc_n.next).name != 'mzArrayBinary') do
|
|
125
|
-
|
|
129
|
+
unless opts[:lazy] == :no_spectra
|
|
126
130
|
mz_array_bin_n = nil
|
|
127
131
|
loop do
|
|
128
132
|
mz_array_bin_n = spec_desc_n.next
|
|
129
133
|
break if mz_array_bin_n.name == 'mzArrayBinary'
|
|
130
134
|
end
|
|
131
|
-
|
|
132
|
-
mz = MS::Spectrum.base64_to_array(data_n.content, data_n['precision'].to_i, ((data_n['endian']=='little') ? false : true))
|
|
135
|
+
mz_data_n = mz_array_bin_n.child
|
|
133
136
|
inten_array_bin_n = mz_array_bin_n.next
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
+
inten_data_n = inten_array_bin_n.child
|
|
138
|
+
case opts[:lazy]
|
|
139
|
+
when :string
|
|
140
|
+
scan[6] = MS::Spectrum::LazyString.from_base64_pair(mz_data_n.content, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true), inten_data_n.content, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true) )
|
|
141
|
+
when :io
|
|
142
|
+
mz_data_n_content = mz_data_n.content
|
|
143
|
+
i_data_n_content = inten_data_n.content
|
|
144
|
+
scan[6] = MS::Spectrum::LazyIO.new(io, mz_data_n_content.first, mz_data_n_content.last, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true), i_data_n_content.first, i_data_n_content.last, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true))
|
|
145
|
+
when :not
|
|
146
|
+
mz = MS::Spectrum.base64_to_array(mz_data_n.content, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true))
|
|
147
|
+
inten = MS::Spectrum.base64_to_array(inten_data_n.content, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true))
|
|
148
|
+
scan[6] = MS::Spectrum.new(mz, inten)
|
|
149
|
+
end
|
|
137
150
|
end
|
|
138
151
|
|
|
139
152
|
# set up the next loop
|
|
@@ -141,7 +154,7 @@ class MS::Parser::MzData::DOM
|
|
|
141
154
|
end
|
|
142
155
|
end
|
|
143
156
|
if bioworks33
|
|
144
|
-
MS::MSRun.add_parent_scan(scans, opts[:
|
|
157
|
+
MS::MSRun.add_parent_scan(scans, ((opts[:lazy] == :not) ? true : false))
|
|
145
158
|
end
|
|
146
159
|
msrun_obj.scans = scans
|
|
147
160
|
msrun_obj.scan_count = scans.size
|
|
@@ -152,6 +165,8 @@ class MS::Parser::MzData::DOM
|
|
|
152
165
|
end
|
|
153
166
|
msrun_obj.start_time = msrun_obj.scans.first.time
|
|
154
167
|
msrun_obj.end_time = msrun_obj.scans.last.time
|
|
168
|
+
|
|
169
|
+
io.close if filename
|
|
155
170
|
end
|
|
156
171
|
|
|
157
172
|
end
|
data/lib/ms/parser/mzdata.rb
CHANGED
|
@@ -11,12 +11,18 @@ module MS::Parser::MzData
|
|
|
11
11
|
|
|
12
12
|
# returns a specific parser MS::Parser::MzXML::#{ParserType}
|
|
13
13
|
# based on choose_parser from xml_style_parser
|
|
14
|
-
def self.new(parse_type=:msrun, version='1.05')
|
|
14
|
+
# Builds and returns an instance of the parser class that
# XMLStyleParser.choose_parser selects for +parse_type+.
# opts[:lazy] == :io requests the 'LazyData' special subclass; any other
# value passes nil for the subclass.
def self.new(parse_type=:msrun, version='1.05', opts={})
  special_subclass = (opts[:lazy] == :io) ? 'LazyData' : nil

  @version = version
  @method = parse_type
  XMLStyleParser.require_parse_files(Base_dir_for_parsers)
  chosen_class = XMLStyleParser.choose_parser(self, parse_type, special_subclass)
  chosen_class.new(parse_type, version)
end
|
|
22
28
|
|
data/lib/ms/parser/mzxml/axml.rb
CHANGED
|
@@ -7,5 +7,64 @@ class MS::Parser::MzXML::AXML < MS::Parser::MzXML::DOM
|
|
|
7
7
|
# Parses the XML document stored at +file+ with ::AXML.parse_file and
# returns that call's result unchanged.
def get_root_node_from_file(file)
  parsed_root = ::AXML.parse_file(file)
  parsed_root
end
|
|
10
|
+
# Parses XML from the already-open +io+ with ::AXML.parse and returns that
# call's result unchanged.
def get_root_node_from_io(io)
  parsed_root = ::AXML.parse(io)
  parsed_root
end
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
# MzXML parser flavour whose three root-node getters all delegate to
# ::AXML::LazyPeaks instead of plain ::AXML.
class MS::Parser::MzXML::AXML::LazyPeaks < MS::Parser::MzXML::AXML
  # root node from an XML document held in a String
  def get_root_node_from_string(string)
    lazy_backend.parse(string)
  end

  # root node from the XML document at path +file+
  def get_root_node_from_file(file)
    lazy_backend.parse_file(file)
  end

  # root node from XML read off an open +io+
  def get_root_node_from_io(io)
    lazy_backend.parse(io)
  end

  private

  # the tree builder every getter above goes through
  def lazy_backend
    ::AXML::LazyPeaks
  end
end
|
|
11
26
|
|
|
27
|
+
class AXML::LazyPeaks < AXML
  class << self
    # Feeds +stream+ to a fresh ::AXML::XMLParser::LazyPeaks and returns the
    # parser's root (the root node as Element, or nodes as Array).
    def parse(stream)
      lazy_parser = ::AXML::XMLParser::LazyPeaks.new
      lazy_parser.parse(stream)
      lazy_parser.root
    end
  end
end
|
|
35
|
+
|
|
36
|
+
# Parser that remembers *where* the peaks payload sits in the input instead
# of keeping the payload text.
# The text of a 'peaks' node is an Array: its first member is the start
# index (byteIndex when character data is first reported) and its last
# member is the number of bytes.  Any members in between should be ignored.
class AXML::XMLParser::LazyPeaks < ::AXML::XMLParser

  # Creates the element for an opening tag, hangs it under the current node
  # and makes it the current node.
  def startElement(name, attributes)
    # 'peaks' nodes collect byte positions; every other node collects text
    text_holder = (name == 'peaks') ? [] : ''
    child = ::AXML::El.new(@cur, name, attributes, text_holder, [])
    @cur.add_node(child)
    @cur = child
  end

  # Records the current byteIndex for position-collecting nodes, or appends
  # the character data itself for ordinary nodes.
  def character(data)
    holder = @cur.text
    holder << (holder.is_a?(Array) ? byteIndex : data)
  end

  # Closes the current node; for position-collecting nodes the distance from
  # the first recorded index to the current byteIndex is appended first.
  def endElement(name)
    positions = @cur.text
    positions << (byteIndex - positions.first) if positions.is_a?(Array)
    @cur = @cur.parent
  end

end
|
data/lib/ms/parser/mzxml/dom.rb
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
require 'xml_style_parser'
|
|
2
2
|
require 'ms/spectrum'
|
|
3
3
|
require 'ms/scan'
|
|
4
|
+
require 'ms/parser/mzxml'
|
|
5
|
+
require 'tempfile'
|
|
4
6
|
|
|
5
7
|
|
|
6
8
|
class MS::Parser::MzXML::DOM
|
|
7
9
|
include XMLStyleParser
|
|
8
10
|
include MS::Parser::MzXML
|
|
9
11
|
|
|
10
|
-
|
|
12
|
+
NetworkOrder = true
|
|
13
|
+
|
|
14
|
+
#@@scan_atts = %w(num msLevel retentionTime startMz endMz precursor spectrum)
|
|
11
15
|
|
|
12
16
|
def initialize(parse_type=:msrun, version='1.0')
|
|
13
17
|
@method = parse_type
|
|
@@ -18,7 +22,9 @@ class MS::Parser::MzXML::DOM
|
|
|
18
22
|
scan = MS::Scan.new # array class creates one with 9 positions
|
|
19
23
|
scan[0] = node['num'].to_i
|
|
20
24
|
scan[1] = node['msLevel'].to_i
|
|
21
|
-
|
|
25
|
+
if x = node['retentionTime']
|
|
26
|
+
scan[2] = x[2...-1].to_f
|
|
27
|
+
end
|
|
22
28
|
if x = node['startMz']
|
|
23
29
|
scan[3] = x.to_f
|
|
24
30
|
scan[4] = node['endMz'].to_f
|
|
@@ -26,39 +32,60 @@ class MS::Parser::MzXML::DOM
|
|
|
26
32
|
scan
|
|
27
33
|
end
|
|
28
34
|
|
|
35
|
+
# Walks +nodes+ (assumed to be scan nodes) depth first, building a scan for
# each via create_scan and storing it both at the next free slot of +scans+
# and at scans_by_num[scan number].  For versions above '1.0' the child
# scan nodes of every scan are processed recursively right after it.
# Returns the next free index in +scans+.
def add_scan_nodes(nodes, scans, scn_index, scans_by_num, lazy, io)
  next_slot = scn_index
  nodes.each do |scan_node|
    built = create_scan(scan_node, scans_by_num, lazy, io)
    scans[next_slot] = built
    scans_by_num[built[0]] = built
    next_slot += 1

    # only versions above 1.0 are searched for nested scan nodes
    next unless @version > '1.0'
    nested = scan_node.find('child::scan')
    next unless nested.size > 0
    next_slot = add_scan_nodes(nested, scans, next_slot, scans_by_num, lazy, io)
  end
  next_slot
end
|
|
51
|
+
|
|
29
52
|
# takes a scan node and creates a scan object
|
|
30
53
|
# the parent scan is the one directly above it in mslevel
|
|
31
|
-
#
|
|
32
|
-
def create_scan(scan_n, scans_by_num,
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
54
|
+
# lazy must be a symbol from MS::MSRun.new
|
|
55
|
+
# Builds a scan from +scan_n+ (via new_scan_from_hash) and fills in its
# precursor (slot 5) and spectrum (slot 6) from the node's children.
#
# scans_by_num:: scans indexed by scan number; used to look up the scan a
#                'precursorScanNum' attribute points at
# lazy::         :no_spectra skips peaks entirely, :string and :io build lazy
#                spectra, :not decodes the peaks right away
# io::           handed to MS::Spectrum::LazyIO when lazy is :io
#
# Raises RuntimeError if the scan node has more than one precursorMz child.
def create_scan(scan_n, scans_by_num, lazy, io=nil)
  scan = new_scan_from_hash(scan_n)
  precursor = nil
  scan_n.each do |child|
    case child.name
    when 'precursorMz'
      raise RuntimeError, "the msrun object can only handle one precursor!" unless precursor.nil?
      precursor = MS::Precursor.new
      precursor[1] = child['precursorIntensity'].to_f
      precursor[0] = child.content.to_f
      parent_num = child['precursorScanNum']
      precursor[2] = scans_by_num[parent_num.to_i] if parent_num
    when 'peaks'
      next if lazy == :no_spectra
      if lazy == :string
        scan[6] = MS::Spectrum::LazyString.from_base64_peaks(child.content, child['precision'].to_i)
      elsif lazy == :io
        # assumes that parsing was done with a LazyPeaks parser, so content
        # is an Array of positions rather than text
        positions = child.content
        scan[6] = MS::Spectrum::LazyIO.new(io, positions.first, positions.last, child['precision'].to_i, MS::Parser::MzXML::DOM::NetworkOrder)
      elsif lazy == :not
        scan[6] = MS::Spectrum.from_base64_peaks(child.content, child['precision'].to_i)
      end
    end
  end
  scan[5] = precursor
  scan
end
|
|
63
90
|
|
|
64
91
|
|
|
@@ -67,23 +94,15 @@ class MS::Parser::MzXML::DOM
|
|
|
67
94
|
raise NotImplementedError
|
|
68
95
|
end
|
|
69
96
|
|
|
70
|
-
|
|
71
|
-
# </scan> tags after peaks added in
|
|
72
|
-
# we do this in windows style since these are generated off a windows
|
|
73
|
-
# machine only
|
|
74
|
-
def fix_bad_scan_tags(file)
|
|
75
|
-
IO.read(file).gsub(/<\/scan>\s+<\/scan>/m, '</scan>').gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
# right now cannot parse multiple runs out of an mzXML version 2 file since
|
|
97
|
+
# right now cannot parse multiple runs out of an mzXML version 2 file since
|
|
79
98
|
# this is built around a single run per file
|
|
80
99
|
# OPTIONS:
|
|
81
|
-
# :msrun => MSRun
|
|
82
|
-
# :
|
|
100
|
+
# :msrun => (an MSRun object) # use this object instead of creating one
|
|
101
|
+
# :lazy => [See MS::MSRun for documentation]
|
|
83
102
|
def msrun(file, opts={})
|
|
84
|
-
unless opts.key?(:spectra)
|
|
85
|
-
|
|
86
|
-
end
|
|
103
|
+
#unless opts.key?(:spectra)
|
|
104
|
+
# opts[:spectra] = true
|
|
105
|
+
#end
|
|
87
106
|
|
|
88
107
|
msrun_obj =
|
|
89
108
|
if x = opts[:msrun]
|
|
@@ -92,14 +111,20 @@ class MS::Parser::MzXML::DOM
|
|
|
92
111
|
MS::MSRun.new
|
|
93
112
|
end
|
|
94
113
|
|
|
95
|
-
|
|
96
|
-
if
|
|
97
|
-
|
|
98
|
-
|
|
114
|
+
io =
|
|
115
|
+
if file.is_a? String # a filename
|
|
116
|
+
filename = file
|
|
117
|
+
File.open(file)
|
|
99
118
|
else
|
|
100
|
-
|
|
119
|
+
file
|
|
101
120
|
end
|
|
102
121
|
|
|
122
|
+
root = get_root_node_from_io(io)
|
|
123
|
+
|
|
124
|
+
if filename
|
|
125
|
+
io.close # can close now
|
|
126
|
+
end
|
|
127
|
+
|
|
103
128
|
# right now we are only finding the first msRun (probably a rare case of
|
|
104
129
|
# multiple runs in an mzXML file...)
|
|
105
130
|
msrun_n =
|
|
@@ -118,7 +143,7 @@ class MS::Parser::MzXML::DOM
|
|
|
118
143
|
scan_count = msrun_n['scanCount'].to_i
|
|
119
144
|
msrun_obj.scan_count = scan_count
|
|
120
145
|
scans_by_num = Array.new(scan_count + 1)
|
|
121
|
-
|
|
146
|
+
|
|
122
147
|
## SPECTRUM
|
|
123
148
|
parent = nil
|
|
124
149
|
scans = Array.new( scan_count )
|
|
@@ -127,17 +152,16 @@ class MS::Parser::MzXML::DOM
|
|
|
127
152
|
# we should be able to do this, but it's not working!!!
|
|
128
153
|
#scan_n = msrun_n.find_first('scan')
|
|
129
154
|
#while (scn_index < scan_count)
|
|
130
|
-
|
|
155
|
+
lazy = opts[:lazy]
|
|
131
156
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
#sc = scan_n.next
|
|
137
|
-
scans_by_num[scan[0]] = scan
|
|
138
|
-
scn_index += 1
|
|
157
|
+
if @version >= '3.0'
|
|
158
|
+
warn '[version 3.0 parsing may fail if > 1 peak list per scan]'
|
|
159
|
+
# note that mzXML version 3.0 *can* have more than one peak...
|
|
160
|
+
# I'm not sure how to deal with that since I have one spectrum/scan
|
|
139
161
|
end
|
|
140
162
|
|
|
163
|
+
scan_nodes = msrun_n.find('child::scan')
|
|
164
|
+
add_scan_nodes(scan_nodes, scans, scn_index, scans_by_num, lazy, io)
|
|
141
165
|
|
|
142
166
|
## update the scan's parents
|
|
143
167
|
MS::MSRun.add_parent_scan(scans)
|
|
@@ -151,9 +175,8 @@ class MS::Parser::MzXML::DOM
|
|
|
151
175
|
msrun_obj.end_time = scans.last.time
|
|
152
176
|
|
|
153
177
|
msrun_obj.scans = scans
|
|
154
|
-
end
|
|
155
178
|
|
|
179
|
+
end
|
|
156
180
|
end
|
|
157
181
|
|
|
158
182
|
|
|
159
|
-
|
|
@@ -8,7 +8,7 @@ class MS::Parser::MzXML::Hpricot
|
|
|
8
8
|
include XMLStyleParser
|
|
9
9
|
include MS::Parser::MzXML
|
|
10
10
|
|
|
11
|
-
@@scan_atts = %w(num msLevel retentionTime startMz endMz
|
|
11
|
+
@@scan_atts = %w(num msLevel retentionTime startMz endMz precursor spectrum)
|
|
12
12
|
|
|
13
13
|
def initialize(parse_type=:msrun, version='1.0')
|
|
14
14
|
@method = parse_type
|
|
@@ -2,12 +2,16 @@
|
|
|
2
2
|
require 'ms/parser/mzxml/dom'
|
|
3
3
|
|
|
4
4
|
# DOM-style mzXML parser whose root-node getters are backed by XML::Parser.
class MS::Parser::MzXML::LibXML < MS::Parser::MzXML::DOM
  # root node of the XML document held in +string+
  def get_root_node_from_string(string)
    document = XML::Parser.string(string).parse
    document.root
  end

  # root node of the XML document at path +file+
  def get_root_node_from_file(file)
    document = XML::Parser.filename(file).parse
    document.root
  end

  # root node of the XML document read from +io+
  def get_root_node_from_io(io)
    document = XML::Parser.io(io).parse
    document.root
  end

end
|
|
12
16
|
|
|
13
17
|
|
data/lib/ms/parser/mzxml.rb
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
require 'ms/msrun'
|
|
2
|
+
require 'fileutils'
|
|
2
3
|
|
|
3
4
|
module MS; end
|
|
4
5
|
|
|
@@ -7,14 +8,120 @@ module MS::Parser::MzXML
|
|
|
7
8
|
# inherits XMLStyleParser and version
|
|
8
9
|
include MS::Parser
|
|
9
10
|
include XMLStyleParser
|
|
10
|
-
|
|
11
|
+
|
|
12
|
+
# Writes a copy of +filename+ with the bad scan tags cleaned up (the
# line-by-line work is done by fix_bad_scan_tags_from_io).
#
# filename::    path of the file to clean
# newfilename:: where to write the cleaned copy.  When nil (the default) the
#               cleaned copy goes to a Tempfile that is then moved over
#               +filename+ -- warning: this CLOBBERS the original file!
#
# Returns the output filename: +newfilename+ when given, otherwise
# +filename+.  The input is streamed, so it will fix any size file.
def self.fix_bad_scan_tags(filename, newfilename=nil)
  out_io =
    if newfilename
      File.open(newfilename, 'w')
    else
      Tempfile.new(File.basename(filename))
    end
  begin
    File.open(filename) do |fh|
      self.fix_bad_scan_tags_from_io(fh, out_io)
    end
  ensure
    # flush and release the output handle even if reading/cleaning raised
    out_io.close
  end
  return newfilename if newfilename

  # no explicit destination: replace the original with the cleaned copy
  FileUtils.mv out_io.path, filename
  filename
end
|
|
31
|
+
|
|
32
|
+
# Memory-friendly cleaner: copies +io+ to +out_io+ one line at a time,
# leaving out any line that matches </scan> when the line right before it
# also matched </scan>.
# Neither io object is rewound; callers that keep using them must do that
# themselves.
def self.fix_bad_scan_tags_from_io(io, out_io)
  closing_tag = /<\/scan>/
  prev_was_closing = false

  io.each("\n") do |line|
    is_closing = !(line =~ closing_tag).nil?
    # a closing line directly after another closing line is the duplicate
    out_io.print(line) unless is_closing && prev_was_closing
    prev_was_closing = is_closing
  end
end
|
|
53
|
+
|
|
54
|
+
# returns a string with double </scan></scan> tags collapsed into a single tag and missing
|
|
55
|
+
# </scan> tags after peaks added in
|
|
56
|
+
# we do this in windows style since these are generated off a windows
|
|
57
|
+
# machine only
|
|
58
|
+
#def self.fix_bad_scan_tags(string)
|
|
59
|
+
# string.gsub(/<\/scan>\s+<\/scan>/m, '</scan>').gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
|
|
60
|
+
#end
|
|
61
|
+
|
|
62
|
+
# true when +string+ contains two closing </scan> tags separated only by
# whitespace (at least one whitespace character), false otherwise
def self.has_bad_scan_tag_from_string?(string)
  string.match(/<\/scan>\s+<\/scan>/m) ? true : false
end
|
|
70
|
+
|
|
71
|
+
# Opens +filename+ and returns what has_bad_scan_tag_from_io? reports for
# it; the file handle is closed when the block exits.
def self.has_bad_scan_tag?(filename)
  File.open(filename) { |handle| self.has_bad_scan_tag_from_io?(handle) }
end
|
|
76
|
+
|
|
77
|
+
# Cheap check for the malformed xml typical of readw output: two
# consecutive lines that both contain </scan>.  The extra closing scan tags
# come after the last ms/ms scan in a cycle, so reading stops as soon as an
# msLevel="1" line is met after both a level 1 and a level 2 scan have been
# seen.
# Rewinds the io after looking.  Returns true if a doubled end tag was found.
def self.has_bad_scan_tag_from_io?(io)
  saw_ms1 = false
  saw_ms2 = false
  prev_line_closed_scan = false
  doubled = false

  io.each("\n") do |line|
    closes_scan = !(line =~ /<\/scan>/).nil?
    if closes_scan && prev_line_closed_scan # already found one!
      doubled = true
      break
    end
    prev_line_closed_scan = closes_scan

    next unless line =~ /msLevel="(\d+)"/
    level = $1
    # a full cycle has gone by without a doubled tag: stop looking
    break if saw_ms1 && saw_ms2 && level == '1'
    case level
    when '1' then saw_ms1 = true
    when '2' then saw_ms2 = true
    end
  end

  io.rewind
  doubled
end
|
|
112
|
+
|
|
11
113
|
# returns a specific parser MS::Parser::MzXML::#{ParserType}
|
|
12
114
|
# based on choose_parser from xml_style_parser
|
|
13
|
-
def self.new(parse_type=:msrun, version='1.0')
|
|
115
|
+
# Builds and returns an instance of the parser class that
# XMLStyleParser.choose_parser selects for +parse_type+.
# opts[:lazy] == :io requests the 'LazyPeaks' special subclass; any other
# value passes nil for the subclass.
def self.new(parse_type=:msrun, version='1.0', opts={})
  special_subclass = (opts[:lazy] == :io) ? 'LazyPeaks' : nil
  @version = version
  @method = parse_type
  XMLStyleParser.require_parse_files(Base_dir_for_parsers)
  chosen_class = XMLStyleParser.choose_parser(self, parse_type, special_subclass)
  chosen_class.new(parse_type, version)
end
|
|
20
127
|
|