RubyGems - mspire - Versions diffs - 0.3.9 → 0.4.2 - Mend

mspire 0.3.9 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

data/INSTALL +24 -7
data/README +15 -13
data/README.rdoc +18 -0
data/Rakefile +50 -14
data/bin/aafreqs.rb +0 -0
data/bin/bioworks2excel.rb +0 -0
data/bin/bioworks_to_pepxml.rb +2 -1
data/bin/bioworks_to_pepxml_gui.rb +0 -0
data/bin/fasta_shaker.rb +0 -0
data/bin/filter_and_validate.rb +0 -0
data/bin/gi2annot.rb +0 -0
data/bin/id_class_anal.rb +0 -0
data/bin/id_precision.rb +0 -0
data/bin/ms_to_lmat.rb +0 -0
data/bin/pepproph_filter.rb +0 -0
data/bin/protein_summary.rb +0 -0
data/bin/protxml2prots_peps.rb +0 -0
data/bin/raw_to_mzXML.rb +3 -3
data/bin/run_percolator.rb +122 -0
data/bin/sqt_group.rb +0 -0
data/bin/srf_group.rb +0 -0
data/changelog.txt +29 -0
data/lib/ms/gradient_program.rb +0 -1
data/lib/ms/msrun.rb +62 -29
data/lib/ms/parser/mzdata/axml.rb +55 -0
data/lib/ms/parser/mzdata/dom.rb +51 -36
data/lib/ms/parser/mzdata.rb +8 -2
data/lib/ms/parser/mzxml/axml.rb +59 -0
data/lib/ms/parser/mzxml/dom.rb +80 -57
data/lib/ms/parser/mzxml/hpricot.rb +1 -1
data/lib/ms/parser/mzxml/libxml.rb +6 -2
data/lib/ms/parser/mzxml.rb +110 -3
data/lib/ms/parser.rb +4 -4
data/lib/ms/precursor.rb +19 -4
data/lib/ms/scan.rb +7 -7
data/lib/ms/spectrum.rb +249 -58
data/lib/mspire.rb +1 -1
data/lib/spec_id/bioworks.rb +2 -2
data/lib/spec_id/precision/filter/cmdline.rb +8 -1
data/lib/spec_id/precision/prob/cmdline.rb +2 -2
data/lib/spec_id/precision/prob.rb +1 -0
data/lib/spec_id/proph/pep_summary.rb +3 -4
data/lib/spec_id/proph/prot_summary.rb +3 -3
data/lib/spec_id/protein_summary.rb +1 -1
data/lib/spec_id/sequest/pepxml.rb +5 -5
data/lib/spec_id/sqt.rb +4 -4
data/lib/spec_id/srf.rb +49 -8
data/lib/spec_id.rb +5 -0
data/lib/xml_style_parser.rb +16 -2
data/script/compile_and_plot_smriti_final.rb +0 -0
data/script/create_little_pepxml.rb +0 -0
data/script/degenerate_peptides.rb +0 -0
data/script/estimate_fpr_by_cysteine.rb +0 -0
data/script/extract_gradient_programs.rb +1 -1
data/script/find_cysteine_background.rb +0 -0
data/script/genuine_tps_and_probs.rb +0 -0
data/script/get_apex_values_rexml.rb +0 -0
data/script/mascot_fix_pepxml.rb +123 -0
data/script/msvis.rb +0 -0
data/script/mzXML2timeIndex.rb +0 -0
data/script/peps_per_bin.rb +0 -0
data/script/prep_dir.rb +0 -0
data/script/simple_protein_digestion.rb +0 -0
data/script/smriti_final_analysis.rb +0 -0
data/script/sqt_to_meta.rb +0 -0
data/script/top_hit_per_scan.rb +0 -0
data/script/toppred_to_yaml.rb +0 -0
data/script/tpp_installer.rb +0 -0
data/specs/bin/prob_validate_spec.rb +5 -2
data/specs/bin/protein_summary_spec.rb +5 -1
data/specs/ms/msrun_spec.rb +176 -133
data/specs/ms/parser_spec.rb +3 -3
data/specs/ms/spectrum_spec.rb +0 -2
data/specs/spec_id/precision/filter_spec.rb +4 -1
data/specs/spec_id/precision/prob_spec.rb +2 -2
data/specs/spec_id/sequest/pepxml_spec.rb +1 -1
data/specs/spec_id/sqt_spec.rb +5 -5
data/specs/spec_id/srf_spec.rb +56 -93
data/specs/spec_id/srf_spec_helper.rb +121 -284
data/specs/spec_id_spec.rb +3 -0
data/specs/transmem/toppred_spec.rb +1 -0
data/test_files/opd1_2runs_2mods/data/020.mzData.xml +683 -0
data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +382 -0
data/test_files/opd1_2runs_2mods/data/040.mzData.xml +683 -0
data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +382 -0
data/test_files/opd1_2runs_2mods/data/README.txt +6 -0
metadata +247 -229

data/lib/ms/parser/mzdata/axml.rb CHANGED

@@ -4,9 +4,64 @@ class MS::Parser::MzData::AXML < MS::Parser::MzData::DOM
   def get_root_node_from_file(file)
     ::AXML.parse_file(file)
   end
+  def get_root_node_from_io(io)
+    ::AXML.parse(io)
+  end
+end
+class MS::Parser::MzData::AXML::LazyData < MS::Parser::MzData::AXML
+  def get_root_node_from_string(string)
+    ::AXML::LazyData.parse(string)
+  end
+  def get_root_node_from_file(file)
+    ::AXML::LazyData.parse_file(file)
+  end
+  def get_root_node_from_io(io)
+    ::AXML::LazyData.parse(io)
+  end
 end
+class AXML::LazyData < AXML
+  # Returns the root node (as Element) or nodes (as Array)
+  def self.parse(stream)
+    parser = ::AXML::XMLParser::LazyData.new
+    parser.parse(stream)
+    parser.root
+  end
+end
+# This parser stores information about where the data (peaks) information is
+# in the file
+# The content of the data node is an array where the first member is the
+# start index and the last member is the number of bytes.  All other members
+# should be ignored.
+class AXML::XMLParser::LazyData < ::AXML::XMLParser
+  def startElement(name, attributes)
+    text =
+      if name == 'data' ; []
+      else ; ''
+      end
+    new_el = ::AXML::El.new(@cur, name, attributes, text, [])
+    # add the new node to the previous parent node
+    @cur.add_node(new_el)
+    # notice the change in @cur node
+    @cur = new_el
+  end
+  def character(data)
+    if @cur.text.is_a? Array
+      @cur.text << byteIndex
+    else
+      @cur.text << data
+    end
+  end
+  def endElement(name)
+    if @cur.text.is_a? Array
+      @cur.text << (byteIndex - @cur.text.first)
+    end
+    @cur = @cur.parent
+  end
+end

data/lib/ms/parser/mzdata/dom.rb CHANGED

@@ -28,11 +28,7 @@ class MS::Parser::MzData::DOM
   # OPTIONS:
   #   :msrun => MSRun    # use this object instead of creating one
-  #   :spectra => *true|false   # if false don't get spectra
   def msrun(file, opts={})
-    unless opts.key?(:spectra)
-      opts[:spectra] = true
-    end
     msrun_obj =
       if x = opts[:msrun]
         msrun_obj = x
@@ -48,9 +44,18 @@ class MS::Parser::MzData::DOM
     id_to_scan_hash = {}
     #    0   1       2             3       4     5          6
-    # %w(num msLevel retentionTime startMz endMz precursors spectrum)
+    # %w(num msLevel retentionTime startMz endMz precursor spectrum)
+    io =
+      if file.is_a? String
+        filename = file
+        File.open(file)
+      else
+        file
+      end
+    root = get_root_node_from_io(io)
-    root = get_root_node_from_file(file)
     description = root.find_first('child::description')
     bioworks33 = is_bioworks33?(description)
     spectrum_list = description.next
@@ -91,49 +96,57 @@ class MS::Parser::MzData::DOM
         end
         if scan[1] > 1  # precursormz info
           prec_list_n = spec_settings_n.next
-          abort('can only process one precursor m/z right now!') if prec_list_n['count'] != '1'
-          precursors = prec_list_n.find('child::precursor').map do |prec_n|
-            # %w(mz inten parent ms_level parent charge_states)
-            prec = MS::Precursor.new
-            unless bioworks33  # bioworks33 points to the wrong scan!!!
-              prec[2] = id_to_scan_hash[prec_n['spectrumRef'].to_i]
-            end
-            prec[3] = prec_n['msLevel'].to_i
-            charges = []
-            prec_n.find('descendant::cvParam').each do |cv_param_n|
-              case cv_param_n['name']
-              when 'MassToChargeRatio'
-                prec[0] = cv_param_n['value'].to_f
-                # find the prec intensity
-                unless bioworks33
-                  prec[1] = prec[2].spectrum.intensity_at_mz(prec[0])
-                end
-              when 'ChargeState'
-                charges << cv_param_n['value'].to_i
+          raise RuntimeError, "MSRun objects can only accept 1 precursor" if prec_list_n['count'] != '1'
+          prec_n = prec_list_n.find_first('child::precursor')
+          # %w(mz inten parent ms_level parent charge_states)
+          prec = MS::Precursor.new
+          unless bioworks33  # bioworks33 points to the wrong scan!!!
+            prec[2] = id_to_scan_hash[prec_n['spectrumRef'].to_i]
+          end
+          # we're not keeping track of this guy anymore
+          # prec[3] = prec_n['msLevel'].to_i
+          charges = []
+          prec_n.find('descendant::cvParam').each do |cv_param_n|
+            case cv_param_n['name']
+            when 'MassToChargeRatio'
+              prec[0] = cv_param_n['value'].to_f
+              # find the prec intensity
+              unless bioworks33
+                prec[1] = prec[2].spectrum.intensity_at_mz(prec[0])
               end
+            when 'ChargeState'
+              charges << cv_param_n['value'].to_i
             end
-            prec[5] = charges
-            prec
           end
-          scan[5] = precursors
+          prec[3] = charges
+          scan[5] = prec
         else  # no precursors
-          scan[5] = []
+          scan[5] = nil
         end
         # here's the one line way of doing it, but it's probably more clear in
         # the loop
         #while ((mz_array_bin_n = spec_desc_n.next).name != 'mzArrayBinary') do
-        if opts[:spectra]
+        unless opts[:lazy] == :no_spectra
           mz_array_bin_n = nil
           loop do
             mz_array_bin_n = spec_desc_n.next
             break if mz_array_bin_n.name == 'mzArrayBinary'
           end
-          data_n = mz_array_bin_n.child
-          mz = MS::Spectrum.base64_to_array(data_n.content, data_n['precision'].to_i, ((data_n['endian']=='little') ? false : true))
+          mz_data_n = mz_array_bin_n.child
           inten_array_bin_n = mz_array_bin_n.next
-          data_n = inten_array_bin_n.child
-          inten = MS::Spectrum.base64_to_array(data_n.content, data_n['precision'].to_i, ((data_n['endian']=='little') ? false : true))
-          scan[6] = MS::Spectrum.new(mz, inten)
+          inten_data_n = inten_array_bin_n.child
+          case opts[:lazy]
+          when :string
+           scan[6] = MS::Spectrum::LazyString.from_base64_pair(mz_data_n.content, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true), inten_data_n.content, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true) )
+          when :io
+            mz_data_n_content = mz_data_n.content
+            i_data_n_content = inten_data_n.content
+            scan[6] = MS::Spectrum::LazyIO.new(io, mz_data_n_content.first, mz_data_n_content.last, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true), i_data_n_content.first, i_data_n_content.last, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true))
+          when :not
+            mz = MS::Spectrum.base64_to_array(mz_data_n.content, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true))
+            inten = MS::Spectrum.base64_to_array(inten_data_n.content, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true))
+            scan[6] = MS::Spectrum.new(mz, inten)
+          end
         end
         # set up the next loop
@@ -141,7 +154,7 @@ class MS::Parser::MzData::DOM
       end
     end
     if bioworks33
-      MS::MSRun.add_parent_scan(scans, opts[:spectra])
+      MS::MSRun.add_parent_scan(scans, ((opts[:lazy] == :not) ? true : false))
     end
     msrun_obj.scans = scans
     msrun_obj.scan_count = scans.size
@@ -152,6 +165,8 @@ class MS::Parser::MzData::DOM
     end
     msrun_obj.start_time = msrun_obj.scans.first.time
     msrun_obj.end_time = msrun_obj.scans.last.time
+    io.close if filename
   end
 end

data/lib/ms/parser/mzdata.rb CHANGED

@@ -11,12 +11,18 @@ module MS::Parser::MzData
   # returns a specific parser MS::Parser::MzXML::#{ParserType}
   # based on choose_parser from xml_style_parser
-  def self.new(parse_type=:msrun, version='1.05')
+  def self.new(parse_type=:msrun, version='1.05', opts={})
+    special_subclass =
+      if opts[:lazy] == :io
+      'LazyData'
+      else ; nil
+      end
     @version = version
     @method = parse_type
     #p self.methods.grep /choose_parser/
     XMLStyleParser.require_parse_files(Base_dir_for_parsers)
-    parser_class = XMLStyleParser.choose_parser(self, parse_type)
+    parser_class = XMLStyleParser.choose_parser(self, parse_type, special_subclass)
     parser = parser_class.new(parse_type, version)
   end

data/lib/ms/parser/mzxml/axml.rb CHANGED

@@ -7,5 +7,64 @@ class MS::Parser::MzXML::AXML < MS::Parser::MzXML::DOM
   def get_root_node_from_file(file)
     ::AXML.parse_file(file)
   end
+  def get_root_node_from_io(io)
+    ::AXML.parse(io)
+  end
+end
+class MS::Parser::MzXML::AXML::LazyPeaks < MS::Parser::MzXML::AXML
+  def get_root_node_from_string(string)
+    ::AXML::LazyPeaks.parse(string)
+  end
+  def get_root_node_from_file(file)
+    ::AXML::LazyPeaks.parse_file(file)
+  end
+  def get_root_node_from_io(io)
+    ::AXML::LazyPeaks.parse(io)
+  end
 end
+class AXML::LazyPeaks < AXML
+  # Returns the root node (as Element) or nodes (as Array)
+  def self.parse(stream)
+    parser = ::AXML::XMLParser::LazyPeaks.new
+    parser.parse(stream)
+    parser.root
+  end
+end
+# This parser stores information about where the peaks information is in the
+# file
+# The content of the peaks node is an array where the first member is the
+# start index and the last member is the number of bytes.  All other members
+# should be ignored.
+class AXML::XMLParser::LazyPeaks < ::AXML::XMLParser
+  def startElement(name, attributes)
+    text =
+      if name == 'peaks' ; []
+      else ; ''
+      end
+    new_el = ::AXML::El.new(@cur, name, attributes, text, [])
+    # add the new node to the previous parent node
+    @cur.add_node(new_el)
+    # notice the change in @cur node
+    @cur = new_el
+  end
+  def character(data)
+    if @cur.text.is_a? Array
+      @cur.text << byteIndex
+    else
+      @cur.text << data
+    end
+  end
+  def endElement(name)
+    if @cur.text.is_a? Array
+      @cur.text << (byteIndex - @cur.text.first)
+    end
+    @cur = @cur.parent
+  end
+end

data/lib/ms/parser/mzxml/dom.rb CHANGED

@@ -1,13 +1,17 @@
 require 'xml_style_parser'
 require 'ms/spectrum'
 require 'ms/scan'
+require 'ms/parser/mzxml'
+require 'tempfile'
 class MS::Parser::MzXML::DOM
   include XMLStyleParser
   include MS::Parser::MzXML
-  #@@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)
+  NetworkOrder = true
+  #@@scan_atts = %w(num msLevel retentionTime startMz endMz precursor spectrum)
   def initialize(parse_type=:msrun, version='1.0')
     @method = parse_type
@@ -18,7 +22,9 @@ class MS::Parser::MzXML::DOM
     scan = MS::Scan.new  # array class creates one with 9 positions
     scan[0] = node['num'].to_i
     scan[1] = node['msLevel'].to_i
-    scan[2] = node['retentionTime'][2...-1].to_f
+    if x = node['retentionTime']
+      scan[2] = x[2...-1].to_f
+    end
     if x = node['startMz']
       scan[3] = x.to_f
       scan[4] = node['endMz'].to_f
@@ -26,39 +32,60 @@ class MS::Parser::MzXML::DOM
     scan
   end
+  # assumes that node contains scans and checks any scan nodes for children
+  def add_scan_nodes(nodes, scans, scn_index, scans_by_num, lazy, io)
+    nodes.each do |scan_n|
+      scan = create_scan(scan_n, scans_by_num, lazy, io)
+      scans[scn_index] = scan
+      scans_by_num[scan[0]] = scan
+      scn_index += 1
+      if @version > '1.0'
+        new_nodes = scan_n.find('child::scan')
+        if new_nodes.size > 0
+          scn_index = add_scan_nodes(new_nodes, scans, scn_index, scans_by_num, lazy, io)
+        end
+      end
+    end
+    scn_index
+  end
   # takes a scan node and creates a scan object
   # the parent scan is the one directly above it in mslevel
-  # if the
-  def create_scan(scan_n, scans_by_num, get_spectra=true)
-    if @version < '3.0'
-      scan = new_scan_from_hash(scan_n)
-      precs = []
-      scan_n.each do |node|
-        case node.name
-        when 'precursorMz'
-          # should be able to do this!!!
-          #scan[5] = scan_n.find('child::precursorMz').map do |prec_n|
-          prec = MS::Precursor.new
-          prec[1] = node['precursorIntensity'].to_f
-          prec[0] = node.content.to_f
-          if x = node['precursorScanNum']
-            prec[2] = scans_by_num[x.to_i]
-          end
-          precs << prec
-        when 'peaks'
-          next unless get_spectra
+  # lazy must be a symbol from MS::MSRun.new
+  def create_scan(scan_n, scans_by_num, lazy, io=nil)
+    scan = new_scan_from_hash(scan_n)
+    prec = nil
+    scan_n.each do |node|
+      case node.name
+      when 'precursorMz'
+        # should be able to do this!!!
+        #scan[5] = scan_n.find('child::precursorMz').map do |prec_n|
+        raise RuntimeError, "the msrun object can only handle one precursor!" unless prec.nil?
+        prec = MS::Precursor.new
+        prec[1] = node['precursorIntensity'].to_f
+        prec[0] = node.content.to_f
+        if x = node['precursorScanNum']
+          prec[2] = scans_by_num[x.to_i]
+        end
+      when 'peaks'
+        case lazy
+        when :no_spectra
+          next
+        when :string
+          scan[6] = MS::Spectrum::LazyString.from_base64_peaks(node.content, node['precision'].to_i)
+        when :io
+          # assumes that parsing was done with a LazyPeaks parser!
+          nc = node.content
+          scan[6] = MS::Spectrum::LazyIO.new(io, nc.first, nc.last, node['precision'].to_i, MS::Parser::MzXML::DOM::NetworkOrder)
+        when :not
           # SHOULD be able to do this!!
           #peaks_n = scan_n.find_first('child::peaks')
           scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
         end
       end
-      scan[5] = precs
-      scan
-    else  # for version > 3.0
-      abort 'not supporting version 3.0 just yet'
-      # note that mzXML version 3.0 *can* have more than one peak...
-      # I'm not sure how to deal with that since I have one spectrum/scan
     end
+    scan[5] = prec
+    scan
   end
@@ -67,23 +94,15 @@ class MS::Parser::MzXML::DOM
     raise NotImplementedError
   end
-  # returns a string with double </scan></scan> tags into single and missing
-  # </scan> tags after peaks added in
-  # we do this in windows style since these are generated off a windows
-  # machine only
-  def fix_bad_scan_tags(file)
-    IO.read(file).gsub(/<\/scan>\s+<\/scan>/m, '</scan>').gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n  </scan>\r\n  <scan")
-  end
-  # right now cannot parse multiple runs out of an mzXML version 2 file since
+    # right now cannot parse multiple runs out of an mzXML version 2 file since
   # this is built around a single run per file
   # OPTIONS:
-  #   :msrun => MSRun    # use this object instead of creating one
-  #   :spectra => *true|false   # if false don't get spectra
+  #   :msrun => (an MSRun object)   # use this object instead of creating one
+  #   :lazy => [See MS::MSRun for documentation]
   def msrun(file, opts={})
-    unless opts.key?(:spectra)
-      opts[:spectra] = true
-    end
+    #unless opts.key?(:spectra)
+    #  opts[:spectra] = true
+    #end
     msrun_obj =
       if x = opts[:msrun]
@@ -92,14 +111,20 @@ class MS::Parser::MzXML::DOM
         MS::MSRun.new
       end
-    root =
-      if @version == '2.0'
-        string = fix_bad_scan_tags(file)
-        get_root_node_from_string(string)
+    io =
+      if file.is_a? String  # a filename
+        filename = file
+        File.open(file)
       else
-        get_root_node_from_file(file)
+        file
       end
+    root = get_root_node_from_io(io)
+    if filename
+      io.close  # can close now
+    end
     # right now we are only finding the first msRun (probably a rare case of
     # multiple runs in an mzXML file...)
     msrun_n =
@@ -118,7 +143,7 @@ class MS::Parser::MzXML::DOM
     scan_count = msrun_n['scanCount'].to_i
     msrun_obj.scan_count = scan_count
     scans_by_num = Array.new(scan_count + 1)
     ## SPECTRUM
     parent = nil
     scans = Array.new( scan_count )
@@ -127,17 +152,16 @@ class MS::Parser::MzXML::DOM
     # we should be able to do this, but it's not working!!!
     #scan_n = msrun_n.find_first('scan')
     #while (scn_index < scan_count)
-    get_spectra = opts[:spectra]
+    lazy = opts[:lazy]
-    msrun_n.each do |scan_n|
-      next unless scan_n.name == 'scan'
-      scan = create_scan(scan_n, scans_by_num, get_spectra)
-      scans[scn_index] = scan
-      #sc = scan_n.next
-      scans_by_num[scan[0]] = scan
-      scn_index += 1
+    if @version >= '3.0'
+      warn '[version 3.0 parsing may fail if > 1 peak list per scan]'
+      # note that mzXML version 3.0 *can* have more than one peak...
+      # I'm not sure how to deal with that since I have one spectrum/scan
     end
+    scan_nodes = msrun_n.find('child::scan')
+    add_scan_nodes(scan_nodes, scans, scn_index, scans_by_num, lazy, io)
     ## update the scan's parents
     MS::MSRun.add_parent_scan(scans)
@@ -151,9 +175,8 @@ class MS::Parser::MzXML::DOM
     msrun_obj.end_time = scans.last.time
     msrun_obj.scans = scans
-  end
+  end
 end

data/lib/ms/parser/mzxml/hpricot.rb CHANGED

@@ -8,7 +8,7 @@ class MS::Parser::MzXML::Hpricot
   include XMLStyleParser
   include MS::Parser::MzXML
-  @@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)
+  @@scan_atts = %w(num msLevel retentionTime startMz endMz precursor spectrum)
   def initialize(parse_type=:msrun, version='1.0')
     @method = parse_type

data/lib/ms/parser/mzxml/libxml.rb CHANGED

@@ -2,12 +2,16 @@
 require 'ms/parser/mzxml/dom'
 class MS::Parser::MzXML::LibXML < MS::Parser::MzXML::DOM
-  def goot_root_node_from_string(string)
+  def get_root_node_from_string(string)
     XML::Parser.string(string).parse.root
   end
   def get_root_node_from_file(file)
-    XML::Document.file(file).root
+    XML::Parser.filename(file).parse.root
   end
+  def get_root_node_from_io(io)
+    XML::Parser.io(io).parse.root
+  end
 end

data/lib/ms/parser/mzxml.rb CHANGED

@@ -1,4 +1,5 @@
 require 'ms/msrun'
+require 'fileutils'
 module MS; end
@@ -7,14 +8,120 @@ module MS::Parser::MzXML
   # inherits XMLStyleParser and version
   include MS::Parser
   include XMLStyleParser
+  # warning: clobbers file unless a newfilename is provided!
+  # returns the output filename
+  # will fix any size file!
+  def self.fix_bad_scan_tags(filename, newfilename=nil)
+    out_io =
+      if newfilename
+        File.open(newfilename, 'w')
+      else
+        Tempfile.new(File.basename(filename))
+      end
+    File.open(filename) do |fh|
+      self.fix_bad_scan_tags_from_io(fh, out_io)
+    end
+    out_io.close
+    unless newfilename
+      FileUtils.mv out_io.path, filename
+    end
+  end
+  # this is a memory efficient method to fix bad scan tags
+  # prints cleaned up file to out_io
+  # no effort is made to rewind the io objects, the user must do this if they
+  # plan to continue using these objects!
+  def self.fix_bad_scan_tags_from_io(io, out_io)
+    regexp = /<\/scan>/
+    end_scan_line = false
+    io.each("\n") do |line|
+      if end_scan_line && line =~ regexp
+        # two end scan lines! # don't print to out_io
+        end_scan_line = true
+      elsif line =~ regexp
+        out_io.print(line)
+        end_scan_line = true
+      else
+        out_io.print(line)
+        end_scan_line = false
+      end
+    end
+  end
+  # returns a string with double </scan></scan> tags into single and missing
+  # </scan> tags after peaks added in
+  # we do this in windows style since these are generated off a windows
+  # machine only
+  #def self.fix_bad_scan_tags(string)
+  #  string.gsub(/<\/scan>\s+<\/scan>/m, '</scan>').gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n  </scan>\r\n  <scan")
+  #end
+  # returns true if it has the bad tag
+  def self.has_bad_scan_tag_from_string?(string)
+    if string.match(/<\/scan>\s+<\/scan>/m)
+      true
+    else
+      false
+    end
+  end
+  def self.has_bad_scan_tag?(filename)
+    File.open(filename) do |fh|
+      self.has_bad_scan_tag_from_io?(fh)
+    end
+  end
+  # very efficient algorithm to check for malformed xml typical of readw
+  # output. The extra closing scan tags come after the last ms/ms scan in a
+  # cycle rewinds the io after looking
+  def self.has_bad_scan_tag_from_io?(io)
+    seen_first_ms_level = false
+    seen_higher_ms_level = false
+    cur_ms_level = 0
+    found_double_end_tag = false
+    found_end_tag = false
+    io.each("\n") do |line|
+      if line =~ /<\/scan>/
+        if found_end_tag  # already found one!
+          found_double_end_tag = true
+          break
+        end
+        found_end_tag = true
+      else
+        found_end_tag = false
+      end
+      if line =~ /msLevel="(\d+)"/
+        cur_ms_level = $1.dup
+        if seen_first_ms_level && seen_higher_ms_level && cur_ms_level == '1'
+          break
+        end
+        if cur_ms_level == '1'
+          seen_first_ms_level = true
+        elsif cur_ms_level == '2'
+          seen_higher_ms_level = true
+        end
+      end
+    end
+    io.rewind
+    found_double_end_tag
+  end
   # returns a specific parser MS::Parser::MzXML::#{ParserType}
   # based on choose_parser from xml_style_parser
-  def self.new(parse_type=:msrun, version='1.0')
+  def self.new(parse_type=:msrun, version='1.0', opts={})
+    special_subclass =
+      if opts[:lazy] == :io
+      'LazyPeaks'
+      else ; nil
+      end
     @version = version
     @method = parse_type
     XMLStyleParser.require_parse_files(Base_dir_for_parsers)
-    parser_class = XMLStyleParser.choose_parser(self, parse_type)
+    parser_class = XMLStyleParser.choose_parser(self, parse_type, special_subclass)
     parser = parser_class.new(parse_type, version)
   end