mspire 0.3.9 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. data/INSTALL +24 -7
  2. data/README +15 -13
  3. data/README.rdoc +18 -0
  4. data/Rakefile +50 -14
  5. data/bin/aafreqs.rb +0 -0
  6. data/bin/bioworks2excel.rb +0 -0
  7. data/bin/bioworks_to_pepxml.rb +2 -1
  8. data/bin/bioworks_to_pepxml_gui.rb +0 -0
  9. data/bin/fasta_shaker.rb +0 -0
  10. data/bin/filter_and_validate.rb +0 -0
  11. data/bin/gi2annot.rb +0 -0
  12. data/bin/id_class_anal.rb +0 -0
  13. data/bin/id_precision.rb +0 -0
  14. data/bin/ms_to_lmat.rb +0 -0
  15. data/bin/pepproph_filter.rb +0 -0
  16. data/bin/protein_summary.rb +0 -0
  17. data/bin/protxml2prots_peps.rb +0 -0
  18. data/bin/raw_to_mzXML.rb +3 -3
  19. data/bin/run_percolator.rb +122 -0
  20. data/bin/sqt_group.rb +0 -0
  21. data/bin/srf_group.rb +0 -0
  22. data/changelog.txt +29 -0
  23. data/lib/ms/gradient_program.rb +0 -1
  24. data/lib/ms/msrun.rb +62 -29
  25. data/lib/ms/parser/mzdata/axml.rb +55 -0
  26. data/lib/ms/parser/mzdata/dom.rb +51 -36
  27. data/lib/ms/parser/mzdata.rb +8 -2
  28. data/lib/ms/parser/mzxml/axml.rb +59 -0
  29. data/lib/ms/parser/mzxml/dom.rb +80 -57
  30. data/lib/ms/parser/mzxml/hpricot.rb +1 -1
  31. data/lib/ms/parser/mzxml/libxml.rb +6 -2
  32. data/lib/ms/parser/mzxml.rb +110 -3
  33. data/lib/ms/parser.rb +4 -4
  34. data/lib/ms/precursor.rb +19 -4
  35. data/lib/ms/scan.rb +7 -7
  36. data/lib/ms/spectrum.rb +249 -58
  37. data/lib/mspire.rb +1 -1
  38. data/lib/spec_id/bioworks.rb +2 -2
  39. data/lib/spec_id/precision/filter/cmdline.rb +8 -1
  40. data/lib/spec_id/precision/prob/cmdline.rb +2 -2
  41. data/lib/spec_id/precision/prob.rb +1 -0
  42. data/lib/spec_id/proph/pep_summary.rb +3 -4
  43. data/lib/spec_id/proph/prot_summary.rb +3 -3
  44. data/lib/spec_id/protein_summary.rb +1 -1
  45. data/lib/spec_id/sequest/pepxml.rb +5 -5
  46. data/lib/spec_id/sqt.rb +4 -4
  47. data/lib/spec_id/srf.rb +49 -8
  48. data/lib/spec_id.rb +5 -0
  49. data/lib/xml_style_parser.rb +16 -2
  50. data/script/compile_and_plot_smriti_final.rb +0 -0
  51. data/script/create_little_pepxml.rb +0 -0
  52. data/script/degenerate_peptides.rb +0 -0
  53. data/script/estimate_fpr_by_cysteine.rb +0 -0
  54. data/script/extract_gradient_programs.rb +1 -1
  55. data/script/find_cysteine_background.rb +0 -0
  56. data/script/genuine_tps_and_probs.rb +0 -0
  57. data/script/get_apex_values_rexml.rb +0 -0
  58. data/script/mascot_fix_pepxml.rb +123 -0
  59. data/script/msvis.rb +0 -0
  60. data/script/mzXML2timeIndex.rb +0 -0
  61. data/script/peps_per_bin.rb +0 -0
  62. data/script/prep_dir.rb +0 -0
  63. data/script/simple_protein_digestion.rb +0 -0
  64. data/script/smriti_final_analysis.rb +0 -0
  65. data/script/sqt_to_meta.rb +0 -0
  66. data/script/top_hit_per_scan.rb +0 -0
  67. data/script/toppred_to_yaml.rb +0 -0
  68. data/script/tpp_installer.rb +0 -0
  69. data/specs/bin/prob_validate_spec.rb +5 -2
  70. data/specs/bin/protein_summary_spec.rb +5 -1
  71. data/specs/ms/msrun_spec.rb +176 -133
  72. data/specs/ms/parser_spec.rb +3 -3
  73. data/specs/ms/spectrum_spec.rb +0 -2
  74. data/specs/spec_id/precision/filter_spec.rb +4 -1
  75. data/specs/spec_id/precision/prob_spec.rb +2 -2
  76. data/specs/spec_id/sequest/pepxml_spec.rb +1 -1
  77. data/specs/spec_id/sqt_spec.rb +5 -5
  78. data/specs/spec_id/srf_spec.rb +56 -93
  79. data/specs/spec_id/srf_spec_helper.rb +121 -284
  80. data/specs/spec_id_spec.rb +3 -0
  81. data/specs/transmem/toppred_spec.rb +1 -0
  82. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +683 -0
  83. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +382 -0
  84. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +683 -0
  85. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +382 -0
  86. data/test_files/opd1_2runs_2mods/data/README.txt +6 -0
  87. metadata +247 -229
@@ -4,9 +4,64 @@ class MS::Parser::MzData::AXML < MS::Parser::MzData::DOM
4
4
  def get_root_node_from_file(file)
5
5
  ::AXML.parse_file(file)
6
6
  end
7
+ def get_root_node_from_io(io)
8
+ ::AXML.parse(io)
9
+ end
10
+ end
11
+
12
+ class MS::Parser::MzData::AXML::LazyData < MS::Parser::MzData::AXML
13
+ def get_root_node_from_string(string)
14
+ ::AXML::LazyData.parse(string)
15
+ end
16
+ def get_root_node_from_file(file)
17
+ ::AXML::LazyData.parse_file(file)
18
+ end
19
+ def get_root_node_from_io(io)
20
+ ::AXML::LazyData.parse(io)
21
+ end
7
22
  end
8
23
 
24
+ class AXML::LazyData < AXML
25
+ # Returns the root node (as Element) or nodes (as Array)
26
+ def self.parse(stream)
27
+ parser = ::AXML::XMLParser::LazyData.new
28
+ parser.parse(stream)
29
+ parser.root
30
+ end
31
+ end
9
32
 
33
+ # This parser stores information about where the data (peaks) information is
34
+ # in the file
35
+ # The content of the data node is an array where the first member is the
36
+ # start index and the last member is the number of bytes. All other members
37
+ # should be ignored.
38
+ class AXML::XMLParser::LazyData < ::AXML::XMLParser
10
39
 
40
+ def startElement(name, attributes)
41
+ text =
42
+ if name == 'data' ; []
43
+ else ; ''
44
+ end
45
+ new_el = ::AXML::El.new(@cur, name, attributes, text, [])
46
+ # add the new node to the previous parent node
47
+ @cur.add_node(new_el)
48
+ # notice the change in @cur node
49
+ @cur = new_el
50
+ end
11
51
 
52
+ def character(data)
53
+ if @cur.text.is_a? Array
54
+ @cur.text << byteIndex
55
+ else
56
+ @cur.text << data
57
+ end
58
+ end
12
59
 
60
+ def endElement(name)
61
+ if @cur.text.is_a? Array
62
+ @cur.text << (byteIndex - @cur.text.first)
63
+ end
64
+ @cur = @cur.parent
65
+ end
66
+
67
+ end
@@ -28,11 +28,7 @@ class MS::Parser::MzData::DOM
28
28
 
29
29
  # OPTIONS:
30
30
  # :msrun => MSRun # use this object instead of creating one
31
- # :spectra => *true|false # if false don't get spectra
32
31
  def msrun(file, opts={})
33
- unless opts.key?(:spectra)
34
- opts[:spectra] = true
35
- end
36
32
  msrun_obj =
37
33
  if x = opts[:msrun]
38
34
  msrun_obj = x
@@ -48,9 +44,18 @@ class MS::Parser::MzData::DOM
48
44
  id_to_scan_hash = {}
49
45
 
50
46
  # 0 1 2 3 4 5 6
51
- # %w(num msLevel retentionTime startMz endMz precursors spectrum)
47
+ # %w(num msLevel retentionTime startMz endMz precursor spectrum)
48
+
49
+ io =
50
+ if file.is_a? String
51
+ filename = file
52
+ File.open(file)
53
+ else
54
+ file
55
+ end
56
+ root = get_root_node_from_io(io)
57
+
52
58
 
53
- root = get_root_node_from_file(file)
54
59
  description = root.find_first('child::description')
55
60
  bioworks33 = is_bioworks33?(description)
56
61
  spectrum_list = description.next
@@ -91,49 +96,57 @@ class MS::Parser::MzData::DOM
91
96
  end
92
97
  if scan[1] > 1 # precursormz info
93
98
  prec_list_n = spec_settings_n.next
94
- abort('can only process one precursor m/z right now!') if prec_list_n['count'] != '1'
95
- precursors = prec_list_n.find('child::precursor').map do |prec_n|
96
- # %w(mz inten parent ms_level parent charge_states)
97
- prec = MS::Precursor.new
98
- unless bioworks33 # bioworks33 points to the wrong scan!!!
99
- prec[2] = id_to_scan_hash[prec_n['spectrumRef'].to_i]
100
- end
101
- prec[3] = prec_n['msLevel'].to_i
102
- charges = []
103
- prec_n.find('descendant::cvParam').each do |cv_param_n|
104
- case cv_param_n['name']
105
- when 'MassToChargeRatio'
106
- prec[0] = cv_param_n['value'].to_f
107
- # find the prec intensity
108
- unless bioworks33
109
- prec[1] = prec[2].spectrum.intensity_at_mz(prec[0])
110
- end
111
- when 'ChargeState'
112
- charges << cv_param_n['value'].to_i
99
+ raise RuntimeError, "MSRun objects can only accept 1 precursor" if prec_list_n['count'] != '1'
100
+ prec_n = prec_list_n.find_first('child::precursor')
101
+ # %w(mz inten parent ms_level parent charge_states)
102
+ prec = MS::Precursor.new
103
+ unless bioworks33 # bioworks33 points to the wrong scan!!!
104
+ prec[2] = id_to_scan_hash[prec_n['spectrumRef'].to_i]
105
+ end
106
+ # we're not keeping track of this guy anymore
107
+ # prec[3] = prec_n['msLevel'].to_i
108
+ charges = []
109
+ prec_n.find('descendant::cvParam').each do |cv_param_n|
110
+ case cv_param_n['name']
111
+ when 'MassToChargeRatio'
112
+ prec[0] = cv_param_n['value'].to_f
113
+ # find the prec intensity
114
+ unless bioworks33
115
+ prec[1] = prec[2].spectrum.intensity_at_mz(prec[0])
113
116
  end
117
+ when 'ChargeState'
118
+ charges << cv_param_n['value'].to_i
114
119
  end
115
- prec[5] = charges
116
- prec
117
120
  end
118
- scan[5] = precursors
121
+ prec[3] = charges
122
+ scan[5] = prec
119
123
  else # no precursors
120
- scan[5] = []
124
+ scan[5] = nil
121
125
  end
122
126
  # here's the one line way of doing it, but it's probably more clear in
123
127
  # the loop
124
128
  #while ((mz_array_bin_n = spec_desc_n.next).name != 'mzArrayBinary') do
125
- if opts[:spectra]
129
+ unless opts[:lazy] == :no_spectra
126
130
  mz_array_bin_n = nil
127
131
  loop do
128
132
  mz_array_bin_n = spec_desc_n.next
129
133
  break if mz_array_bin_n.name == 'mzArrayBinary'
130
134
  end
131
- data_n = mz_array_bin_n.child
132
- mz = MS::Spectrum.base64_to_array(data_n.content, data_n['precision'].to_i, ((data_n['endian']=='little') ? false : true))
135
+ mz_data_n = mz_array_bin_n.child
133
136
  inten_array_bin_n = mz_array_bin_n.next
134
- data_n = inten_array_bin_n.child
135
- inten = MS::Spectrum.base64_to_array(data_n.content, data_n['precision'].to_i, ((data_n['endian']=='little') ? false : true))
136
- scan[6] = MS::Spectrum.new(mz, inten)
137
+ inten_data_n = inten_array_bin_n.child
138
+ case opts[:lazy]
139
+ when :string
140
+ scan[6] = MS::Spectrum::LazyString.from_base64_pair(mz_data_n.content, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true), inten_data_n.content, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true) )
141
+ when :io
142
+ mz_data_n_content = mz_data_n.content
143
+ i_data_n_content = inten_data_n.content
144
+ scan[6] = MS::Spectrum::LazyIO.new(io, mz_data_n_content.first, mz_data_n_content.last, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true), i_data_n_content.first, i_data_n_content.last, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true))
145
+ when :not
146
+ mz = MS::Spectrum.base64_to_array(mz_data_n.content, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true))
147
+ inten = MS::Spectrum.base64_to_array(inten_data_n.content, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true))
148
+ scan[6] = MS::Spectrum.new(mz, inten)
149
+ end
137
150
  end
138
151
 
139
152
  # set up the next loop
@@ -141,7 +154,7 @@ class MS::Parser::MzData::DOM
141
154
  end
142
155
  end
143
156
  if bioworks33
144
- MS::MSRun.add_parent_scan(scans, opts[:spectra])
157
+ MS::MSRun.add_parent_scan(scans, ((opts[:lazy] == :not) ? true : false))
145
158
  end
146
159
  msrun_obj.scans = scans
147
160
  msrun_obj.scan_count = scans.size
@@ -152,6 +165,8 @@ class MS::Parser::MzData::DOM
152
165
  end
153
166
  msrun_obj.start_time = msrun_obj.scans.first.time
154
167
  msrun_obj.end_time = msrun_obj.scans.last.time
168
+
169
+ io.close if filename
155
170
  end
156
171
 
157
172
  end
@@ -11,12 +11,18 @@ module MS::Parser::MzData
11
11
 
12
12
  # returns a specific parser MS::Parser::MzXML::#{ParserType}
13
13
  # based on choose_parser from xml_style_parser
14
- def self.new(parse_type=:msrun, version='1.05')
14
+ def self.new(parse_type=:msrun, version='1.05', opts={})
15
+ special_subclass =
16
+ if opts[:lazy] == :io
17
+ 'LazyData'
18
+ else ; nil
19
+ end
20
+
15
21
  @version = version
16
22
  @method = parse_type
17
23
  #p self.methods.grep /choose_parser/
18
24
  XMLStyleParser.require_parse_files(Base_dir_for_parsers)
19
- parser_class = XMLStyleParser.choose_parser(self, parse_type)
25
+ parser_class = XMLStyleParser.choose_parser(self, parse_type, special_subclass)
20
26
  parser = parser_class.new(parse_type, version)
21
27
  end
22
28
 
@@ -7,5 +7,64 @@ class MS::Parser::MzXML::AXML < MS::Parser::MzXML::DOM
7
7
  def get_root_node_from_file(file)
8
8
  ::AXML.parse_file(file)
9
9
  end
10
+ def get_root_node_from_io(io)
11
+ ::AXML.parse(io)
12
+ end
13
+ end
14
+
15
+ class MS::Parser::MzXML::AXML::LazyPeaks < MS::Parser::MzXML::AXML
16
+ def get_root_node_from_string(string)
17
+ ::AXML::LazyPeaks.parse(string)
18
+ end
19
+ def get_root_node_from_file(file)
20
+ ::AXML::LazyPeaks.parse_file(file)
21
+ end
22
+ def get_root_node_from_io(io)
23
+ ::AXML::LazyPeaks.parse(io)
24
+ end
10
25
  end
11
26
 
27
+ class AXML::LazyPeaks < AXML
28
+ # Returns the root node (as Element) or nodes (as Array)
29
+ def self.parse(stream)
30
+ parser = ::AXML::XMLParser::LazyPeaks.new
31
+ parser.parse(stream)
32
+ parser.root
33
+ end
34
+ end
35
+
36
+ # This parser stores information about where the peaks information is in the
37
+ # file
38
+ # The content of the peaks node is an array where the first member is the
39
+ # start index and the last member is the number of bytes. All other members
40
+ # should be ignored.
41
+ class AXML::XMLParser::LazyPeaks < ::AXML::XMLParser
42
+
43
+ def startElement(name, attributes)
44
+ text =
45
+ if name == 'peaks' ; []
46
+ else ; ''
47
+ end
48
+ new_el = ::AXML::El.new(@cur, name, attributes, text, [])
49
+ # add the new node to the previous parent node
50
+ @cur.add_node(new_el)
51
+ # notice the change in @cur node
52
+ @cur = new_el
53
+ end
54
+
55
+ def character(data)
56
+ if @cur.text.is_a? Array
57
+ @cur.text << byteIndex
58
+ else
59
+ @cur.text << data
60
+ end
61
+ end
62
+
63
+ def endElement(name)
64
+ if @cur.text.is_a? Array
65
+ @cur.text << (byteIndex - @cur.text.first)
66
+ end
67
+ @cur = @cur.parent
68
+ end
69
+
70
+ end
@@ -1,13 +1,17 @@
1
1
  require 'xml_style_parser'
2
2
  require 'ms/spectrum'
3
3
  require 'ms/scan'
4
+ require 'ms/parser/mzxml'
5
+ require 'tempfile'
4
6
 
5
7
 
6
8
  class MS::Parser::MzXML::DOM
7
9
  include XMLStyleParser
8
10
  include MS::Parser::MzXML
9
11
 
10
- #@@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)
12
+ NetworkOrder = true
13
+
14
+ #@@scan_atts = %w(num msLevel retentionTime startMz endMz precursor spectrum)
11
15
 
12
16
  def initialize(parse_type=:msrun, version='1.0')
13
17
  @method = parse_type
@@ -18,7 +22,9 @@ class MS::Parser::MzXML::DOM
18
22
  scan = MS::Scan.new # array class creates one with 9 positions
19
23
  scan[0] = node['num'].to_i
20
24
  scan[1] = node['msLevel'].to_i
21
- scan[2] = node['retentionTime'][2...-1].to_f
25
+ if x = node['retentionTime']
26
+ scan[2] = x[2...-1].to_f
27
+ end
22
28
  if x = node['startMz']
23
29
  scan[3] = x.to_f
24
30
  scan[4] = node['endMz'].to_f
@@ -26,39 +32,60 @@ class MS::Parser::MzXML::DOM
26
32
  scan
27
33
  end
28
34
 
35
+ # assumes that node contains scans and checks any scan nodes for children
36
+ def add_scan_nodes(nodes, scans, scn_index, scans_by_num, lazy, io)
37
+ nodes.each do |scan_n|
38
+ scan = create_scan(scan_n, scans_by_num, lazy, io)
39
+ scans[scn_index] = scan
40
+ scans_by_num[scan[0]] = scan
41
+ scn_index += 1
42
+ if @version > '1.0'
43
+ new_nodes = scan_n.find('child::scan')
44
+ if new_nodes.size > 0
45
+ scn_index = add_scan_nodes(new_nodes, scans, scn_index, scans_by_num, lazy, io)
46
+ end
47
+ end
48
+ end
49
+ scn_index
50
+ end
51
+
29
52
  # takes a scan node and creates a scan object
30
53
  # the parent scan is the one directly above it in mslevel
31
- # if the
32
- def create_scan(scan_n, scans_by_num, get_spectra=true)
33
- if @version < '3.0'
34
- scan = new_scan_from_hash(scan_n)
35
- precs = []
36
- scan_n.each do |node|
37
- case node.name
38
- when 'precursorMz'
39
- # should be able to do this!!!
40
- #scan[5] = scan_n.find('child::precursorMz').map do |prec_n|
41
- prec = MS::Precursor.new
42
- prec[1] = node['precursorIntensity'].to_f
43
- prec[0] = node.content.to_f
44
- if x = node['precursorScanNum']
45
- prec[2] = scans_by_num[x.to_i]
46
- end
47
- precs << prec
48
- when 'peaks'
49
- next unless get_spectra
54
+ # lazy must be a symbol from MS::MSRun.new
55
+ def create_scan(scan_n, scans_by_num, lazy, io=nil)
56
+ scan = new_scan_from_hash(scan_n)
57
+ prec = nil
58
+ scan_n.each do |node|
59
+ case node.name
60
+ when 'precursorMz'
61
+ # should be able to do this!!!
62
+ #scan[5] = scan_n.find('child::precursorMz').map do |prec_n|
63
+ raise RuntimeError, "the msrun object can only handle one precursor!" unless prec.nil?
64
+ prec = MS::Precursor.new
65
+ prec[1] = node['precursorIntensity'].to_f
66
+ prec[0] = node.content.to_f
67
+ if x = node['precursorScanNum']
68
+ prec[2] = scans_by_num[x.to_i]
69
+ end
70
+ when 'peaks'
71
+ case lazy
72
+ when :no_spectra
73
+ next
74
+ when :string
75
+ scan[6] = MS::Spectrum::LazyString.from_base64_peaks(node.content, node['precision'].to_i)
76
+ when :io
77
+ # assumes that parsing was done with a LazyPeaks parser!
78
+ nc = node.content
79
+ scan[6] = MS::Spectrum::LazyIO.new(io, nc.first, nc.last, node['precision'].to_i, MS::Parser::MzXML::DOM::NetworkOrder)
80
+ when :not
50
81
  # SHOULD be able to do this!!
51
82
  #peaks_n = scan_n.find_first('child::peaks')
52
83
  scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
53
84
  end
54
85
  end
55
- scan[5] = precs
56
- scan
57
- else # for version > 3.0
58
- abort 'not supporting version 3.0 just yet'
59
- # note that mzXML version 3.0 *can* have more than one peak...
60
- # I'm not sure how to deal with that since I have one spectrum/scan
61
86
  end
87
+ scan[5] = prec
88
+ scan
62
89
  end
63
90
 
64
91
 
@@ -67,23 +94,15 @@ class MS::Parser::MzXML::DOM
67
94
  raise NotImplementedError
68
95
  end
69
96
 
70
- # returns a string with double </scan></scan> tags into single and missing
71
- # </scan> tags after peaks added in
72
- # we do this in windows style since these are generated off a windows
73
- # machine only
74
- def fix_bad_scan_tags(file)
75
- IO.read(file).gsub(/<\/scan>\s+<\/scan>/m, '</scan>').gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
76
- end
77
-
78
- # right now cannot parse multiple runs out of an mzXML version 2 file since
97
+ # right now cannot parse multiple runs out of an mzXML version 2 file since
79
98
  # this is built around a single run per file
80
99
  # OPTIONS:
81
- # :msrun => MSRun # use this object instead of creating one
82
- # :spectra => *true|false # if false don't get spectra
100
+ # :msrun => (an MSRun object) # use this object instead of creating one
101
+ # :lazy => [See MS::MSRun for documentation]
83
102
  def msrun(file, opts={})
84
- unless opts.key?(:spectra)
85
- opts[:spectra] = true
86
- end
103
+ #unless opts.key?(:spectra)
104
+ # opts[:spectra] = true
105
+ #end
87
106
 
88
107
  msrun_obj =
89
108
  if x = opts[:msrun]
@@ -92,14 +111,20 @@ class MS::Parser::MzXML::DOM
92
111
  MS::MSRun.new
93
112
  end
94
113
 
95
- root =
96
- if @version == '2.0'
97
- string = fix_bad_scan_tags(file)
98
- get_root_node_from_string(string)
114
+ io =
115
+ if file.is_a? String # a filename
116
+ filename = file
117
+ File.open(file)
99
118
  else
100
- get_root_node_from_file(file)
119
+ file
101
120
  end
102
121
 
122
+ root = get_root_node_from_io(io)
123
+
124
+ if filename
125
+ io.close # can close now
126
+ end
127
+
103
128
  # right now we are only finding the first msRun (probably a rare case of
104
129
  # multiple runs in an mzXML file...)
105
130
  msrun_n =
@@ -118,7 +143,7 @@ class MS::Parser::MzXML::DOM
118
143
  scan_count = msrun_n['scanCount'].to_i
119
144
  msrun_obj.scan_count = scan_count
120
145
  scans_by_num = Array.new(scan_count + 1)
121
-
146
+
122
147
  ## SPECTRUM
123
148
  parent = nil
124
149
  scans = Array.new( scan_count )
@@ -127,17 +152,16 @@ class MS::Parser::MzXML::DOM
127
152
  # we should be able to do this, but it's not working!!!
128
153
  #scan_n = msrun_n.find_first('scan')
129
154
  #while (scn_index < scan_count)
130
- get_spectra = opts[:spectra]
155
+ lazy = opts[:lazy]
131
156
 
132
- msrun_n.each do |scan_n|
133
- next unless scan_n.name == 'scan'
134
- scan = create_scan(scan_n, scans_by_num, get_spectra)
135
- scans[scn_index] = scan
136
- #sc = scan_n.next
137
- scans_by_num[scan[0]] = scan
138
- scn_index += 1
157
+ if @version >= '3.0'
158
+ warn '[version 3.0 parsing may fail if > 1 peak list per scan]'
159
+ # note that mzXML version 3.0 *can* have more than one peak...
160
+ # I'm not sure how to deal with that since I have one spectrum/scan
139
161
  end
140
162
 
163
+ scan_nodes = msrun_n.find('child::scan')
164
+ add_scan_nodes(scan_nodes, scans, scn_index, scans_by_num, lazy, io)
141
165
 
142
166
  ## update the scan's parents
143
167
  MS::MSRun.add_parent_scan(scans)
@@ -151,9 +175,8 @@ class MS::Parser::MzXML::DOM
151
175
  msrun_obj.end_time = scans.last.time
152
176
 
153
177
  msrun_obj.scans = scans
154
- end
155
178
 
179
+ end
156
180
  end
157
181
 
158
182
 
159
-
@@ -8,7 +8,7 @@ class MS::Parser::MzXML::Hpricot
8
8
  include XMLStyleParser
9
9
  include MS::Parser::MzXML
10
10
 
11
- @@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)
11
+ @@scan_atts = %w(num msLevel retentionTime startMz endMz precursor spectrum)
12
12
 
13
13
  def initialize(parse_type=:msrun, version='1.0')
14
14
  @method = parse_type
@@ -2,12 +2,16 @@
2
2
  require 'ms/parser/mzxml/dom'
3
3
 
4
4
  class MS::Parser::MzXML::LibXML < MS::Parser::MzXML::DOM
5
- def goot_root_node_from_string(string)
5
+ def get_root_node_from_string(string)
6
6
  XML::Parser.string(string).parse.root
7
7
  end
8
8
  def get_root_node_from_file(file)
9
- XML::Document.file(file).root
9
+ XML::Parser.filename(file).parse.root
10
10
  end
11
+ def get_root_node_from_io(io)
12
+ XML::Parser.io(io).parse.root
13
+ end
14
+
11
15
  end
12
16
 
13
17
 
@@ -1,4 +1,5 @@
1
1
  require 'ms/msrun'
2
+ require 'fileutils'
2
3
 
3
4
  module MS; end
4
5
 
@@ -7,14 +8,120 @@ module MS::Parser::MzXML
7
8
  # inherits XMLStyleParser and version
8
9
  include MS::Parser
9
10
  include XMLStyleParser
10
-
11
+
12
+ # warning: clobbers file unless a newfilename is provided!
13
+ # returns the output filename
14
+ # will fix any size file!
15
+ def self.fix_bad_scan_tags(filename, newfilename=nil)
16
+
17
+ out_io =
18
+ if newfilename
19
+ File.open(newfilename, 'w')
20
+ else
21
+ Tempfile.new(File.basename(filename))
22
+ end
23
+ File.open(filename) do |fh|
24
+ self.fix_bad_scan_tags_from_io(fh, out_io)
25
+ end
26
+ out_io.close
27
+ unless newfilename
28
+ FileUtils.mv out_io.path, filename
29
+ end
30
+ end
31
+
32
+ # this is a memory efficient method to fix bad scan tags
33
+ # prints cleaned up file to out_io
34
+ # no effort is made to rewind the io objects, the user must do this if they
35
+ # plan to continue using these objects!
36
+ def self.fix_bad_scan_tags_from_io(io, out_io)
37
+ regexp = /<\/scan>/
38
+ end_scan_line = false
39
+
40
+ io.each("\n") do |line|
41
+ if end_scan_line && line =~ regexp
42
+ # two end scan lines! # don't print to out_io
43
+ end_scan_line = true
44
+ elsif line =~ regexp
45
+ out_io.print(line)
46
+ end_scan_line = true
47
+ else
48
+ out_io.print(line)
49
+ end_scan_line = false
50
+ end
51
+ end
52
+ end
53
+
54
+ # returns a string with double </scan></scan> tags into single and missing
55
+ # </scan> tags after peaks added in
56
+ # we do this in windows style since these are generated off a windows
57
+ # machine only
58
+ #def self.fix_bad_scan_tags(string)
59
+ # string.gsub(/<\/scan>\s+<\/scan>/m, '</scan>').gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
60
+ #end
61
+
62
+ # returns true if it has the bad tag
63
+ def self.has_bad_scan_tag_from_string?(string)
64
+ if string.match(/<\/scan>\s+<\/scan>/m)
65
+ true
66
+ else
67
+ false
68
+ end
69
+ end
70
+
71
+ def self.has_bad_scan_tag?(filename)
72
+ File.open(filename) do |fh|
73
+ self.has_bad_scan_tag_from_io?(fh)
74
+ end
75
+ end
76
+
77
+ # very efficient algorithm to check for malformed xml typical of readw
78
+ # output. The extra closing scan tags come after the last ms/ms scan in a
79
+ # cycle rewinds the io after looking
80
+ def self.has_bad_scan_tag_from_io?(io)
81
+ seen_first_ms_level = false
82
+ seen_higher_ms_level = false
83
+ cur_ms_level = 0
84
+ found_double_end_tag = false
85
+ found_end_tag = false
86
+ io.each("\n") do |line|
87
+ if line =~ /<\/scan>/
88
+ if found_end_tag # already found one!
89
+ found_double_end_tag = true
90
+ break
91
+ end
92
+ found_end_tag = true
93
+ else
94
+ found_end_tag = false
95
+ end
96
+
97
+ if line =~ /msLevel="(\d+)"/
98
+ cur_ms_level = $1.dup
99
+ if seen_first_ms_level && seen_higher_ms_level && cur_ms_level == '1'
100
+ break
101
+ end
102
+ if cur_ms_level == '1'
103
+ seen_first_ms_level = true
104
+ elsif cur_ms_level == '2'
105
+ seen_higher_ms_level = true
106
+ end
107
+ end
108
+ end
109
+ io.rewind
110
+ found_double_end_tag
111
+ end
112
+
11
113
  # returns a specific parser MS::Parser::MzXML::#{ParserType}
12
114
  # based on choose_parser from xml_style_parser
13
- def self.new(parse_type=:msrun, version='1.0')
115
+ def self.new(parse_type=:msrun, version='1.0', opts={})
116
+ special_subclass =
117
+ if opts[:lazy] == :io
118
+ 'LazyPeaks'
119
+ else ; nil
120
+ end
14
121
  @version = version
15
122
  @method = parse_type
16
123
  XMLStyleParser.require_parse_files(Base_dir_for_parsers)
17
- parser_class = XMLStyleParser.choose_parser(self, parse_type)
124
+ parser_class = XMLStyleParser.choose_parser(self, parse_type, special_subclass)
18
125
  parser = parser_class.new(parse_type, version)
19
126
  end
20
127