RubyGems - ms-msrun - Versions diffs - 0.0.1 - Mend

ms-msrun 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

data/LICENSE +16 -0
data/README +62 -0
data/Rakefile +113 -0
data/bin/base64_to_array.rb +13 -0
data/bin/ms_to_obiwarp.rb +69 -0
data/bin/ms_to_search.rb +44 -0
data/lib/bsearch.rb +120 -0
data/lib/lmat.rb +171 -0
data/lib/ms/msrun.rb +297 -0
data/lib/ms/msrun/axml/mzxml.rb +141 -0
data/lib/ms/msrun/search.rb +118 -0
data/lib/ms/precursor.rb +27 -0
data/lib/ms/scan.rb +93 -0
data/lib/ms/spectrum.rb +373 -0
data/lib/ms/spectrum/compare.rb +118 -0
data/lib/ms/spectrum/filter.rb +46 -0
data/spec/lmat_spec.rb +29 -0
data/spec/ms/msrun/search_spec.rb +56 -0
data/spec/ms/msrun_spec.rb +50 -0
data/spec/ms/scan_spec.rb +34 -0
data/spec/ms/spectrum/compare_spec.rb +44 -0
data/spec/ms/spectrum/filter_spec.rb +33 -0
metadata +97 -0

data/LICENSE ADDED

@@ -0,0 +1,16 @@
+Copyright (c) 2006, 2007, 2008 The University of Texas at Austin
+Copyright (c) 2009, University of Colorado at Boulder and Howard Hughes
+Medical Institute
+The above copyright holders are collectively designated "COPYRIGHT HOLDER"
+Software by John T. Prince under the direction of Edward M. Marcotte and
+Natalie Ahn.
+By using this software the USER indicates that he or she has read, understood and will comply with the following:
+COPYRIGHT HOLDER hereby grants USER permission to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of this software and its documentation for any purpose and without fee, provided that a full copy of this notice is included with the software and its documentation.
+Title to copyright this software and its associated documentation shall at all times remain with COPYRIGHT HOLDER. No right is granted to use in advertising, publicity or otherwise any trademark, service mark, or the name of COPYRIGHT HOLDER.
+This software and any associated documentation are provided "as is," and COPYRIGHT HOLDER MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESSED OR IMPLIED, INCLUDING THOSE OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT USE OF THE SOFTWARE, MODIFICATIONS, OR ASSOCIATED DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER INTELLECTUAL PROPERTY RIGHTS OF A THIRD PARTY. COPYRIGHT HOLDER and associated Regents, officers, and employees shall not be liable under any circumstances for any direct, indirect, special, incidental, or consequential damages with respect to any claim by USER or any third party on account of or arising from the use, or inability to use, this software or its associated documentation, even if COPYRIGHT HOLDER has been advised of the possibility of those damages.

data/README ADDED

@@ -0,0 +1,62 @@
+= {ms-msrun}[http://mspire.rubyforge.org/projects/ms-msrun]
+A library for working with LC/MS runs.
+== Examples
+The following example works on *mzXML* and *mzData* files (and, shortly, on *mzML* files)!
+    require "ms/msrun"
+    Ms::Msrun.open("file.mzXML") do |ms|
+      ms.start_time       # in seconds
+      ms.end_time         # in seconds
+      ms.scan_count       # number of scans
+      ms.scan_count(1)    # number of MS scans
+      ms.scan_count(2)    # number of MS/MS scans, etc.
+      ms.parent_basename_noext   # "file" (as recorded _in the xml_)
+      ms.filename                # "file.mzXML"
+      ms.scans.each do |scan|
+        scan.num          # scan number
+        scan.ms_level     # ms_level
+        scan.time         # retention time in seconds
+        scan.start_mz     # the first m/z value
+        scan.end_mz       # the last m/z value
+        # Precursor information
+        pr = scan.precursor  # an Ms::Precursor object
+        pr.mz
+        pr.intensity      # does fast binary search if info not already given
+        pr.parent         # the parent scan
+        pr.charge_states  # Array of possible charge states
+        # Spectral information
+        spectrum = scan.spectrum
+        spectrum.mzs          # Array of m/z values
+        spectrum.intensities  # Array of intensity values
+        spectrum.peaks do |mz, inten|
+          puts "#{mz} #{inten}"   # print each peak on own line
+        end
+      end
+    end
+== Features
+[*Fast*] uses xmlparser under the hood.
+[*Unified*] one interface for all formats
+[<b>Lazy evaluation of spectra</b>] By default, reads from IO when data is required.
+[<b>Minimal Dependencies</b>] xmlparser (available for most distros and windows one-click installer) and axml.  Very nearly have support for LibXML.
+== Installation
+    gem install ms-msrun
+The library currently relies on xmlparser (though LibXML is close to being
+supported).  After installation of ms-msrun (which will automatically install
+`axml`) issue this command to get instructions on installing xmlparser:
+    ruby -rubygems -e 'require "axml"; \
+        puts AXML::Autoload.install_instructions(:xmlparser)'

data/Rakefile ADDED

@@ -0,0 +1,113 @@
+require 'rake'
+require 'rubygems'
+require 'rake/rdoctask'
+require 'rake/gempackagetask'
+require 'rake/testtask'
+require 'rake/clean'
+require 'fileutils'
+###############################################
+# GLOBAL
+###############################################
+FL = FileList
+NAME = "ms-msrun"
+FU = FileUtils
+readme = "README"
+rdoc_dir = 'rdoc'
+rdoc_extra_includes = [readme, "LICENSE"]
+rdoc_options = ['--main', readme, '--title', NAME, '--line-numbers', '--inline-source']
+lib_files = FL["lib/**/*.rb"]
+dist_files = lib_files + FL[readme, "LICENSE", "Rakefile", "{specs}/**/*"]
+changelog = 'CHANGELOG'
+###############################################
+# DOC
+###############################################
+Rake::RDocTask.new do |rd|
+  rd.rdoc_dir = rdoc_dir
+  rd.main = readme
+  rd.rdoc_files.include( rdoc_extra_includes )
+  rd.rdoc_files.include( lib_files.uniq )
+  rd.options.push( *rdoc_options )
+end
+desc "create and upload docs to server"
+task :upload_docs => [:rdoc] do
+  sh "scp -r #{rdoc_dir}/* jtprince@rubyforge.org:/var/www/gforge-projects/mspire/ms-msrun/"
+end
+###############################################
+# TESTS
+###############################################
+desc 'Default: Run specs.'
+task :default => :spec
+desc 'Run specs.'
+Rake::TestTask.new(:spec) do |t|
+  #t.verbose = true
+  #t.warning = true
+  ENV['TEST'] = ENV['SPEC'] if ENV['SPEC']
+  t.libs = ['lib']
+  t.test_files = Dir.glob( File.join('spec', ENV['pattern'] || '**/*_spec.rb') )
+  #t.options = "-v"
+end
+###############################################
+# PACKAGE / INSTALL / UNINSTALL
+###############################################
+tm = Time.now
+gemspec = Gem::Specification.new do |t|
+  description = 'A library for working with LC/MS runs. Part of mspire.  Has parsers for mzXML v1, 2, and 3, mzData and mzML.  Can convert to commonly desired search output (such as mgf)'
+  summary = "A library for working with LC/MS runs"
+  t.platform = Gem::Platform::RUBY
+  t.name = NAME
+  t.version =  IO.readlines(changelog).grep(/##.*version/).pop.split(/\s+/).last.chomp
+  t.homepage = 'http://mspire.rubyforge.org/projects/ms-msrun'
+  t.rubyforge_project = 'mspire'
+  t.summary = summary
+  t.date = "#{tm.year}-#{tm.month}-#{tm.day}"
+  t.email = "jtprince@gmail.com"
+  t.description = description
+  t.has_rdoc = true
+  t.authors = ["John Prince"]
+  t.files = dist_files
+  t.add_dependency 'axml', '~> 0.0.5'
+  t.add_dependency 'runarray'
+  t.rdoc_options = rdoc_options
+  t.extra_rdoc_files = rdoc_extra_includes
+  t.executables = FL["bin/*"].map {|file| File.basename(file) }
+  t.requirements << 'xmlparser (preferrably) or libxml'
+  t.test_files = FL["spec/**/*_spec.rb"]
+end
+desc "Create packages."
+Rake::GemPackageTask.new(gemspec) do |pkg|
+  #pkg.need_zip = true
+  #pkg.need_tar = true
+end
+task :remove_pkg do
+  FileUtils.rm_rf "pkg"
+end
+task :install => [:reinstall]
+desc "uninstalls the package, packages a fresh one, and installs"
+task :reinstall => [:remove_pkg, :clean, :package] do
+  reply = `#{$gemcmd} list -l #{NAME}`
+  if reply.include?(NAME + " (")
+    %x( #{$gemcmd} uninstall -a -x #{NAME} )
+  end
+  FileUtils.cd("pkg") do
+    cmd = "#{$gemcmd} install #{NAME}*.gem"
+    puts "EXECUTING: #{cmd}"
+    system cmd
+  end
+end

data/bin/base64_to_array.rb ADDED

@@ -0,0 +1,13 @@
+require 'ms/spectrum'
+if ARGV.size == 0
+  puts "usage: #{File.basename(__FILE__)} <base64_string>"
+  puts "outputs the array of values"
+  exit
+end
+precision = 32
+network_order = true
+ar = Ms::Spectrum.base64_to_array(ARGV.shift, precision, network_order)
+puts "[ " + ar.join(", ") + " ]"

data/bin/ms_to_obiwarp.rb ADDED

@@ -0,0 +1,69 @@
+#!/usr/bin/ruby
+require 'rubygems'
+require 'ms/msrun'
+require 'lmat'
+require 'optparse'
+require 'ostruct'
+require 'runarray'
+# defaults:
+opt = {}
+opt[:baseline] = 0.0
+opt[:newext] = ".lmat"
+opt[:inc_mz] = 1.0
+# get options:
+opts = OptionParser.new do |op|
+  op.banner = "usage: #{File.basename(__FILE__)} [options] <msfile> ..."
+  op.separator "input: .mzdata or .mzXML"
+  op.separator ""
+  op.separator "(sums m/z values that round to the same bin)"
+  op.separator ""
+  op.on("--mz-inc N", Float, "m/z increment (def: 1.0)") {|n| opt[:mz_inc] = n.to_f}
+  op.on("--mz-start N", Float, "m/z start (def: start of 1st full scan)") {|n| opt[:start_mz] = n.to_f}
+  op.on("--mz-end N", Float, "m/z end (def: end of 1st full scan)") {|n| opt[:end_mz] = n.to_f}
+  op.on("--baseline N", Float, "value for missing indices (def: #{opt[:baseline]})") {|n| opt[:baseline] = n.to_f}
+  op.on("--ascii", "generates an lmata file instead") {opt[:ascii] = true}
+  op.on("-v", "--verbose") {$VERBOSE = true}
+end
+opts.parse!
+if ARGV.size < 1
+  puts opts
+end
+ARGV.each do |file|
+  Ms::Msrun.open(file) do |msrun|
+    mslevel = 1
+    (start_mz, end_mz) = msrun.start_and_end_mz(mslevel)
+    (times, spectra) = msrun.times_and_spectra(mslevel)
+    args = {
+      :start_mz => start_mz,
+      :end_mz => end_mz,
+      :start_tm => times.first,
+      :end_tm => times.last,
+      :inc_tm => nil,
+    }
+    args.merge!(opt)
+    lmat = Ms::Msrun::Lmat.new.from_times_and_spectra(times, spectra, args)
+    ext = File.extname(file)
+    outfile = file.sub(/#{Regexp.escape(ext)}$/, opt[:newext])
+      if args[:ascii]
+        outfile << "a"
+        lmat.print(outfile)
+      else
+        lmat.write(outfile)
+      end
+    puts("OUTPUT: #{outfile}") if $VERBOSE
+  end
+end

data/bin/ms_to_search.rb ADDED

@@ -0,0 +1,44 @@
+#!/usr/bin/ruby
+require 'rubygems'
+require 'tap'
+module Ms ; end
+class Ms::Msrun ; end
+# Documentation here
+class Ms::Msrun::Search < Tap::Task
+  #config :first_scan, 0, :short => 'F', &c.integer # first scan
+  #config :last_scan, 1e12, :short => 'L', &c.integer  # last scan
+  ## if not determined to be +1, then create these charge states
+  #config( :charge_states, [2,3], :short => 'c') {|v| v.split(',') }
+  #config :bottom_mh, 0, :short => 'B', &c.float # bottom MH+
+  #config :top_mh, -1.0, :short => 'T', &c.float # top MH+
+  #config :min_peaks, 0, :short => 'P', &c.integer # minimum peak count
+  #config :ms_levels, 2..-1, :short => 'M', &c.range  # ms levels to export
+  def process(filename)
+    Ms::Msrun.open(filename) do |ms|
+      ms.to_mgf(ms.filename.chomp(File.extname(ms.filename)))
+    end
+  end
+end
+Ms::Msrun::Search.execute
+# extract_msn.exe -M0.2 -B85 -T4500 -S0 -G1 -I35 -C0 -P2 -D output smallraw.RAW
+  #config :group_mass_tol, 1.4, :short => 'M', &c.float # prec. mass tolerance for grouping
+  #config :bottom_mw, 0.0, :short => 'B', &c.float # bottom MW for data file creation
+  #config :top_mw, 999999.0, :short => 'T', &c.float # top MW for data file creation
+  #config :interm_scans, 0, :short => 'S', &c.integer # allowed num intermediate scans between groups
+  #config :min_group, 1, :short => 'G', &c.integer # minimum # of related grouped scans needed for a .dta file
+  #config :min_ions, 0, :short => 'I', &c.integer # minimum num of ions needed for a .dta file
+  # What the heck is the -P option?? Not listed in the help!
+  # Ahn lab sets this to: 2
+  # config : :short => 'P',

data/lib/bsearch.rb ADDED

@@ -0,0 +1,120 @@
+#
+# Ruby/Bsearch - a binary search library for Ruby.
+#
+# Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
+#     All rights reserved.
+#     This is free software with ABSOLUTELY NO WARRANTY.
+#
+# You can redistribute it and/or modify it under the terms of
+# the Ruby's licence.
+#
+# Example:
+#
+#  % irb -r ./bsearch.rb
+#  >> %w(a b c c c d e f).bsearch_first {|x| x <=> "c"}
+#  => 2
+#  >> %w(a b c c c d e f).bsearch_last {|x| x <=> "c"}
+#  => 4
+#  >> %w(a b c e f).bsearch_first {|x| x <=> "c"}
+#  => 2
+#  >> %w(a b e f).bsearch_first {|x| x <=> "c"}
+#  => nil
+#  >> %w(a b e f).bsearch_last {|x| x <=> "c"}
+#  => nil
+#  >> %w(a b e f).bsearch_lower_boundary {|x| x <=> "c"}
+#  => 2
+#  >> %w(a b e f).bsearch_upper_boundary {|x| x <=> "c"}
+#  => 2
+#  >> %w(a b c c c d e f).bsearch_range {|x| x <=> "c"}
+#  => 2...5
+#  >> %w(a b c d e f).bsearch_range {|x| x <=> "c"}
+#  => 2...3
+#  >> %w(a b d e f).bsearch_range {|x| x <=> "c"}
+#  => 2...2
+module Bsearch
+  VERSION = '1.5' # presumably the Ruby/Bsearch release this copy corresponds to
+end
+class Array
+  # Binary-search helpers.  Each method yields elements to the block, which must
+  # return a negative, zero or positive number (as <=> does); the receiver is
+  # expected to be ordered consistently with the block.  The algorithm is
+  # extracted from Jon Bentley's Programming Pearls 2nd ed. p.93
+  #
+  # Return the lower boundary (inside): the smallest index in +range+ whose
+  # element makes the block return a non-negative value, or the exclusive end of
+  # +range+ when the block is negative for every element examined.
+  def bsearch_lower_boundary (range = 0 ... self.length, &block)
+    lower  = range.first() -1 # index just before the searched span
+    upper = if range.exclude_end? then range.last else range.last + 1 end # exclusive end of the span
+    while lower + 1 != upper
+      mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
+      if yield(self[mid]) < 0
+	lower = mid
+      else
+	upper = mid
+      end
+    end
+    return upper
+  end
+  #
+  # Searches in binary fashion for the FIRST element for which the block
+  # returns 0 and returns its index.  Returns nil when no such element is
+  # found.  Also available as +bsearch+ through the alias below.
+  #
+  def bsearch_first (range = 0 ... self.length, &block)
+    boundary = bsearch_lower_boundary(range, &block)
+    if boundary >= self.length || yield(self[boundary]) != 0 # past the array end, or not a match
+      return nil
+    else
+      return boundary
+    end
+  end
+  alias bsearch bsearch_first
+  # Return the upper boundary (outside): one past the largest index in +range+
+  # whose element makes the block return zero or a negative value; equals
+  # range.first when the block is positive for every element examined.
+  def bsearch_upper_boundary (range = 0 ... self.length, &block)
+    lower  = range.first() -1 # index just before the searched span
+    upper = if range.exclude_end? then range.last else range.last + 1 end # exclusive end of the span
+    while lower + 1 != upper
+      mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
+      if yield(self[mid]) <= 0
+	lower = mid
+      else
+	upper = mid
+      end
+    end
+    return lower + 1 # outside of the matching range.
+  end
+  #
+  # Searches in binary fashion for the LAST element for which the block
+  # returns 0 and returns its index.  Returns nil when no such element is
+  # found.
+  #
+  def bsearch_last (range = 0 ... self.length, &block)
+    # `- 1' for canceling `lower + 1' in bsearch_upper_boundary.
+    boundary = bsearch_upper_boundary(range, &block) - 1
+    if (boundary <= -1 || yield(self[boundary]) != 0)
+      return nil
+    else
+      return boundary
+    end
+  end
+  #
+  # Return the search result as a Range object: the lower boundary up to (but
+  # not including) the upper boundary, so it is empty when nothing matches.
+  def bsearch_range (range = 0 ... self.length, &block)
+    lower = bsearch_lower_boundary(range, &block)
+    upper = bsearch_upper_boundary(range, &block)
+    return lower ... upper
+  end
+end

data/lib/lmat.rb ADDED

@@ -0,0 +1,171 @@
+require 'runarray'
+include Runarray
+## Labeled matrix
+class Lmat
+  attr_accessor :mvec
+  attr_accessor :nvec
+  # an array of narray objects
+  attr_accessor :mat
+  ## Takes an array of narray objects
+  def initialize(mat=nil, mvec=nil, nvec=nil)
+    @mat = mat
+    @mvec = mvec
+    @nvec = nvec
+  end
+  def max
+    max = mat[0][0]
+    mat.each do |row|
+      row.each do |v|
+        max = v if v > max
+      end
+    end
+    max
+  end
+  # returns self
+  def from_lmat(file)
+    string = IO.read(file)
+    mdim = string.unpack("i")
+    @mvec = NArray.new(string.unpack("f#{mdim}"))
+    ndim = string.unpack("i")
+    @nvec = NArray.new(string.unpack("f#{ndim}"))
+    rows = []
+    mdim.times do
+      rows << string.unpack("f#{ndim}")
+    end
+    @mat = rows
+    self
+  end
+  # returns self
+  def from_lmata(file)
+    # this can probably be made faster
+    File.open(file) do |io|
+      num_m = io.readline.to_i
+      mline = io.readline.chomp
+      @mvec = NArray.new( mline.split(' ').map {|v| v.to_f } )
+      raise RuntimeError, "bad m vec size" if mvec.size != num_m
+      num_n = io.readline.to_i
+      nline = io.readline.chomp
+      @nvec = NArray.new( nline.split(' ').map {|v| v.to_f } )
+      raise RuntimeError, "bad n vec size" if nvec.size != num_n
+      @mat = NArray.new(num_m)
+      num_m.times do |m|
+        line = io.readline
+        line.chomp!
+        @mat[m] = NArray.new(line.split(' ').map {|v| v.to_f })
+      end
+    end
+    self
+  end
+  # converts raw times and spectrum to a labeled matrix
+  # times is an array (or VecI object)
+  # where each row = [mz,inten,mz,inten...]
+  # takes hash with symbols as keys
+  # if inc_tm is undefined, then times from the times array will be used
+  def from_times_and_spectra(times, spectra, args)
+    opt = {
+      :start_mz => 400.0,
+      :end_mz => 1500.0,
+      :inc_mz => 1.0,
+      :behave_mz => 'sum',
+      :start_tm => 0.0,
+      :end_tm => 3600.0,
+      :inc_tm => nil,
+      :baseline=> 0.0,
+    }
+    opt.merge!(args)
+    unless opt[:start_tm] then opt[:start_tm] = times.first end
+    unless opt[:end_tm] then opt[:end_tm] = times.last end
+    if opt[:inc_tm]
+      raise NotImplementedError, "haven't implemented interpolation in ruby yet! (#{File.basename(__FILE__)}: #{__LINE__})"
+    else ## No interpolation
+      if times.first != opt[:start_tm] || times.last != opt[:end_tm]
+        abort "haven't implemented yet! (#{File.basename(__FILE__)}: #{__LINE__})"
+      else
+        @mvec = NArray.new(times)
+        give_vecs = true
+        vecs = spectra.map do |spectrum|
+          #(mz,inten) = spectrum_to_mz_and_inten(spectrum, VecD)
+          # TODO: Figure out a shallow copy here:
+          # perhaps we'll make spectra Vec objects by default in future and
+          # then we'd be set...
+          mzs = NArray.new(spectrum.mzs)
+          intens = NArray.new(spectrum.intensities)
+          (x,y) = mzs.inc_x(intens, opt[:start_mz], opt[:end_mz], opt[:inc_mz], opt[:baseline], opt[:behave_mz])
+          @nvec = x # ends up being the last one, but that's OK
+          y
+        end
+        @mat = vecs
+      end
+    end
+    self
+  end
+  # outputs vec lengths if set to true
+  def to_s(with_vec_lengths=false)
+    arr = []
+    if with_vec_lengths; arr.push(@mvec.size) end
+    arr.push(@mvec.join(" "))
+    if with_vec_lengths; arr.push(@nvec.size) end
+    arr.push(@nvec.join(" "), @mat.map {|v| v.join(" " ) }.join("\n")).join("\n")
+  end
+  def ==(other)
+     other != nil && self.class == other.class && @nvec == other.nvec && @mvec == other.mvec && @mat == other.mat
+  end
+  # converts a single array of alternating m/z intensity values to two
+  # separate arrays
+  # (maybe implement in Ruby::Inline?)
+  # the answer is given in terms of arrs_as (object of class "arrs_as" must
+  # respond to "[]" and create a certain sized array with arrs_as.new(size))
+  def spectrum_to_mz_and_inten(spectrum, arrs_as=Array)
+    half_size = spectrum.size / 2
+    mzs = arrs_as.new(half_size)
+    intens = arrs_as.new(half_size)
+    mz = true
+    spectrum.each_index do |i|
+      if mz
+        mzs[i/2] = spectrum[i]
+        mz = false
+      else
+        mz = true
+        intens[(i-1)/2] = spectrum[i]
+      end
+    end
+    [mzs, intens]
+  end
+  def write(file=nil)
+    handle = $>
+    if file; handle = File.open(file, "wb") end
+    bin_string = ""
+    bin_string << [@mvec.size].pack("i")
+    bin_string << @mvec.pack("f*")
+    bin_string << [@nvec.size].pack("i")
+    bin_string << @nvec.pack("f*")
+    bin_string << @mat.flatten.pack("f*")
+    handle.print bin_string
+    if file; handle.close end
+  end
+  def print(file=nil)
+    handle = $>
+    if file; handle = File.new(file, "w") end
+    handle.print( self.to_s(true) )
+    #$stdout.print( self.to_s(true) )
+    if file; handle.close end
+  end
+end