RubyGems - mspire-lipidomics - Versions diffs - 0.1.4 - Mend

mspire-lipidomics 0.1.4

Files changed (35) hide show

data/.document +5 -0
data/.rspec +1 -0
data/LICENSE +21 -0
data/README.rdoc +18 -0
data/Rakefile +45 -0
data/VERSION +1 -0
data/bin/lipidomic-search.rb +199 -0
data/lib/mspire/lipid/ion/fragment.rb +68 -0
data/lib/mspire/lipid/ion.rb +57 -0
data/lib/mspire/lipid/modification.rb +125 -0
data/lib/mspire/lipid/search/bin.rb +79 -0
data/lib/mspire/lipid/search/db_isobar_group.rb +20 -0
data/lib/mspire/lipid/search/hit.rb +79 -0
data/lib/mspire/lipid/search/probability_distribution.rb +50 -0
data/lib/mspire/lipid/search/query.rb +23 -0
data/lib/mspire/lipid/search.rb +205 -0
data/lib/mspire/lipid.rb +19 -0
data/lib/mspire/lipid_maps.rb +87 -0
data/mspire-lipidomics.gemspec +85 -0
data/scratch/OBConversion_methods.txt +47 -0
data/scratch/atom_methods.txt +145 -0
data/scratch/bond_methods.txt +867 -0
data/scratch/mol_methods.txt +183 -0
data/scratch/split_molecules.rb +93 -0
data/script/find_nearest_lipid.rb +114 -0
data/spec/mspire/lipid/ion_spec.rb +83 -0
data/spec/mspire/lipid/modification_spec.rb +41 -0
data/spec/mspire/lipid/search_spec.rb +79 -0
data/spec/mspire/lipid_maps_spec.rb +64 -0
data/spec/mspire/lipid_spec.rb +16 -0
data/spec/spec_helper.rb +13 -0
data/spec/testfiles/lipidmaps_download.tsv +11 -0
data/spec/testfiles/lipidmaps_programmatic_short.tsv +32 -0
data/spec/testfiles/lipidmaps_sd_download.tsv +11 -0
metadata +162 -0

data/lib/mspire/lipid/search/hit.rb ADDED Viewed

@@ -0,0 +1,79 @@
module Mspire
  class Lipid
    class Search
      # A single match between an observed (experimental) m/z value and one
      # group of isobaric database entries.
      class Hit
        # the db_isobar_group this hit is associated with.  Each hit is only
        # associated with a single db_isobar_group!
        attr_accessor :db_isobar_group
        # the experimental m/z value
        attr_accessor :observed_mz
        # the probability the hit is due to random chance
        attr_accessor :pvalue
        # the FDR if the threshold accepts this pvalue.  Note that this value
        # is relative to the number of tests performed and not completely
        # intrinsic to the hit itself.
        attr_accessor :qvalue
        # qvalue derived from decoy testing
        attr_accessor :decoy_qvalue
        # the probability distribution that can be used to determine its
        # pvalue
        attr_accessor :probability_distribution

        # Builds a hit from a hash of attribute name => value.  Every key is
        # stored in the instance variable of the same name, e.g.
        #
        #     Hit.new(:observed_mz => 760.585, :db_isobar_group => group)
        def initialize(hash={})
          hash.each {|k,v| instance_variable_set("@#{k}", v) }
        end

        # the m/z reported by the first member of the db_isobar_group,
        # returned exactly as that member reports it (no numeric coercion)
        def theoretical_mz
          @db_isobar_group.first.mz
        end

        # observed_mz - theoretical m/z (signed)
        def delta
          @observed_mz - theoretical_mz.to_f
        end
        alias_method :amu, :delta

        # the absolute value of distance from true val
        def delta_abs
          delta.abs
        end

        # parts per million (delta divided by theoretical m/z); signed.  The
        # theoretical m/z is coerced with to_f exactly as in #delta so both
        # methods accept the same kinds of mz values.
        def ppm
          (delta / theoretical_mz.to_f) * 1e6
        end

        # The default inspect string plus the derived deviations.  Falls back
        # to the default string alone while the hit is only partially
        # populated, so that inspecting it never raises.
        def inspect
          return super unless @observed_mz && @db_isobar_group && @db_isobar_group.first
          "<<#{super} -- <ppm=#{ppm} delta=#{delta} theoretical_mz=#{theoretical_mz}>>"
        end
      end

      # A query that matched multiple items.  Each search returns a hit group
      # which consists of the best hits for that experimental m/z.  When
      # queried for values like delta or ppm, it will delegate to the first hit.
      # So, in many ways it can be used as a container for hits, but it puts
      # its best face forward.
      class HitGroup < Array
        # should implement with delegator obviously...
        # should allow setting ???
        def delta() first.delta end
        def ppm() first.ppm end
        def theoretical_mz() first.theoretical_mz end
        def db_isobar_group() first.db_isobar_group end
        def observed_mz() first.observed_mz end
        def pvalue() ; first.pvalue end
        def qvalue() ; first.qvalue end
        def decoy_qvalue() ; first.decoy_qvalue end
        def best_hit() first end

        # Hit defines no #query_group, so blindly delegating to it raised
        # NoMethodError.  Elements that do respond to query_group are still
        # asked directly; otherwise the hit's db_isobar_group is returned.
        # NOTE(review): query_group looks like the former name of
        # db_isobar_group -- confirm against callers.
        def query_group
          best = first
          best.respond_to?(:query_group) ? best.query_group : best.db_isobar_group
        end
      end
    end
  end
end

data/lib/mspire/lipid/search/probability_distribution.rb ADDED Viewed

@@ -0,0 +1,50 @@
module Mspire
  class Lipid
    class Search
      # A distribution described by location, scale and shape parameters
      # that is evaluated through R (the R function pgev) to turn a hit's
      # deviation into a p-value.
      class ProbabilityDistribution
        DEFAULT_TYPE = :ppm
        # the R session shared by every instance; it is created as soon as
        # this class body is evaluated
        R = Rserve::Simpler.new
        # takes location, scale and shape parameters
        attr_accessor :location, :scale, :shape
        # type is :ppm or :delta_abs
        attr_accessor :type

        def initialize(location, scale, shape, type=DEFAULT_TYPE)
          @location, @scale, @shape = location, scale, shape
          @type = type
        end

        # takes a hit and returns the pvalue of its deviation.  The deviation
        # is the value of the hit's method named by #type; its absolute value
        # is used (as in #pvalues) because the log of a negative deviation is
        # NaN.
        def pvalue(hit)
          R.converse "pgev(log(#{hit.send(type).abs}), #{@location}, #{@scale}, #{@shape})"
        end

        # same as pvalue, just tries to limit the number of calls to R to
        # speed things up!  Always returns an Array (one pvalue per hit); an
        # empty list of hits returns [] without contacting R.
        def pvalues(hits)
          return [] if hits.empty?
          deltas = hits.map {|v| v.send(type).abs }
          reply = R.converse("sapply(r_devs, function(elt) pgev(log(elt), #{@location}, #{@scale}, #{@shape}))", :r_devs => deltas)
          # a single deviation comes back as a bare value rather than an Array
          reply.is_a?(Array) ? reply : [reply]
        end

        # Loads an R library by name and raises (after printing installation
        # hints to stderr) when the reply from R looks too short to be a
        # successful load.
        def self.require_r_library(lib)
          reply = R.converse "library(#{lib})"
          unless reply.size > 4  # ~roughly
            $stderr.puts "The libraries ismev and evd must be installed in your R env!"
            $stderr.puts "From within R (works best if R is started with sudo or root for installing):"
            $stderr.puts %Q{install.packages("ismev") ; install.packages("evd")}
            raise "must have R (rserve) and ismev and evd installed!"
          end
        end

        # Fits the log of the given deviations in R (gev.fit) and returns a
        # new ProbabilityDistribution built from the three fitted parameters
        # and the given type.
        def self.deviations_to_probability_distribution(type, devs)
          %w(ismev evd).each {|lib| require_r_library(lib) }
          params = R.converse("m <- gev.fit(log(devs_r))\n c(m$mle[1], m$mle[2], m$mle[3])", :devs_r => devs )
          self.new(*params, type)
        end
      end
    end
  end
end

data/lib/mspire/lipid/search/query.rb ADDED Viewed

@@ -0,0 +1,23 @@
module Mspire
  class Lipid
    class Search
      # A single experimental observation handed to a search.
      class Query
        # mz:    the experimentally observed lowest mz
        # index: the index of the search spectrum that the m/z was derived
        #        from; this allows for the creation of an isotope envelope
        #        starting from a particular m/z value.
        attr_accessor :mz, :index

        def initialize(mz, index)
          @mz = mz
          @index = index
        end
      end
    end
  end
end

data/lib/mspire/lipid/search.rb ADDED Viewed

@@ -0,0 +1,205 @@
+require 'mspire/spectrum'
+require 'rserve/simpler'  # TODO: move to integrated interface with rserve when available
+require 'core_ext/array/in_groups'
+require 'mspire/lipid/search/hit'
+require 'mspire/lipid/search/bin'
+require 'mspire/lipid/modification'
+require 'mspire/lipid/search/probability_distribution'
module Mspire
  class Lipid
    # Matches experimental m/z values against a database of lipid ions and
    # attaches p-values / q-values to the resulting hits.
    class Search
      STANDARD_MODIFICATIONS = {
        :proton => [1,2],
        :ammonium => [1],
        :lithium => [1],
        :water => [1,2],
      }
      STANDARD_SEARCH = {
        :units => :ppm,
        :query_min_count_per_bin => 500,  # min number of peaks per bin
        :num_rand_samples_per_bin => 1000,
        :num_nearest => 2,
        :return_order => :as_given,  # or :sorted
      }
      attr_accessor :options
      attr_accessor :search_function

      # Generates one query object per lipid for every modification applied
      # 1..n times (as listed in mods) and returns a new search object built
      # from them.  Uses only one kind of modification at a time.
      #
      # Combining a gain with a loss (gain_and_loss = true) was never
      # implemented; asking for it raises NotImplementedError up front.
      #
      # NOTE(review): the objects are built with Query.new(lipid, mods) while
      # #initialize documents that it expects Mspire::Lipid::Ion objects --
      # confirm which class is intended here.
      def self.generate_simple_queries(lipids, mods=STANDARD_MODIFICATIONS, gain_and_loss=false)
        if gain_and_loss
          raise NotImplementedError, "combining a gain and a loss is not implemented yet"
        end
        real_mods_and_cnts = mods.map {|name, cnts| [Mspire::Lipid::Modification.new(name), cnts] }
        possible_lipids = []
        # one of each
        real_mods_and_cnts.each do |mod, counts|
          counts.each do |cnt|
            lipids.each do |lipid|
              possible_lipids << Mspire::Lipid::Search::Query.new(lipid, Array.new(cnt, mod))
            end
          end
        end
        self.new(possible_lipids)
      end

      # ions are Mspire::Lipid::Ion objects
      # each one should give a non-nil m/z value
      # opts are merged over STANDARD_SEARCH
      def initialize(ions=[], opts={})
        @options = STANDARD_SEARCH.merge(opts)
        @db_isobar_spectrum = create_db_isobar_spectrum(ions)
        @search_function = create_search_function(ions, @options)
      end

      # Runs the search function on the queries and assigns BH-style q-values
      # (will switch to Storey soon enough) to the top hit of each HitGroup.
      #
      # opts are merged over the options given at construction.
      # :return_order selects what is returned:
      #
      #     :as_given (default; :given is accepted too) the hit groups in the
      #               order the search function produced them
      #     :sorted   the hit groups sorted by ascending p-value
      #
      # Raises ArgumentError for any other :return_order.
      # assumes search_queries are in ascending m/z order
      def search(search_queries, opts={})
        opt = @options.merge( opts )
        hit_groups = @search_function.call(search_queries, opt[:num_nearest])
        sorted_hit_groups = qvalues!(hit_groups, opt)
        # consult the merged options so the documented default applies
        case opt[:return_order]
        when :as_given, :given
          hit_groups
        when :sorted
          sorted_hit_groups
        else
          raise ArgumentError, "invalid :return_order #{opt[:return_order].inspect} (expected :as_given or :sorted)"
        end
      end

      # Sets the qvalue of the first (best) hit of every hit group and returns
      # the hit groups sorted by ascending p-value (ties broken by absolute
      # delta, then absolute ppm, then original position).
      #
      # Raises RuntimeError if any hit group reports a NaN p-value.
      def qvalues!(hit_groups, opts)
        # from http://stats.stackexchange.com/questions/870/multiple-hypothesis-testing-correction-with-benjamini-hochberg-p-values-or-q-va
        num_total_tests = hit_groups.size
        # the index guarantees the hit group itself is never compared in sort
        tuples = hit_groups.each_with_index.map {|hg,i| [hg.pvalue, hg.delta.abs, hg.ppm.abs, i, hg] }
        if tuples.any? {|tuple| tuple.first.respond_to?(:nan?) && tuple.first.nan? }
          $stderr.puts "pvalue of NaN!"
          $stderr.puts ">>> Consider increasing query_min_count_per_bin or setting ppm to false <<<"
          raise "pvalue of NaN!"
        end
        sorted_tuples = tuples.sort
        prev_bh_value = 0
        sorted_tuples.each_with_index do |tuple,i|
          bh_value = tuple.first * num_total_tests / (i + 1)
          # Sometimes this correction can give values greater than 1,
          # so we set those values at 1
          bh_value = [bh_value, 1].min
          # never yield a value less than the one given to the previous
          # (smaller) p-value
          bh_value = [bh_value, prev_bh_value].max
          prev_bh_value = bh_value
          tuple.last.first.qvalue = bh_value # give the top hit the q-value
        end
        sorted_tuples.map(&:last)
      end

      # Builds the search bins (each with its own probability distribution)
      # for the ions and returns a lambda taking (search_queries,
      # num_nearest_hits) that returns an array of hit groups.
      #
      # Deviations are measured in ppm when opt[:ppm] is truthy; when :ppm is
      # absent, opt[:units] == :ppm decides (true for STANDARD_SEARCH).
      def create_search_function(ions, opt)
        db_isobar_spectrum = create_db_isobar_spectrum(ions)
        search_bins = create_search_bins(db_isobar_spectrum, opt[:query_min_count_per_bin])
        use_ppm = opt.fetch(:ppm) { opt[:units] == :ppm }
        create_probability_distribution_for_search_bins!(search_bins, db_isobar_spectrum, opt[:num_rand_samples_per_bin], use_ppm)
        # create the actual search function
        lambda do |search_queries, num_nearest_hits|
          # honor the per-call value; fall back to the construction-time one
          num_nearest = num_nearest_hits || opt[:num_nearest]
          Bin.bin(search_bins, search_queries, &:mz)
          bins_with_data = search_bins.reject {|bin| bin.data.empty? }
          bins_with_data.map {|bin| bin.queries_to_hit_groups!(num_nearest) }.flatten(1)
        end
      end

      #####################################################
      # Ancillary to create_search_function:
      #####################################################

      # returns a DB isobar spectrum where the m/z values are all the m/z
      # values to search for and the intensities each an array corresponding
      # to all the lipid ions matching that m/z value
      def create_db_isobar_spectrum(ions)
        pairs = ions.group_by(&:mz).sort_by(&:first)
        mzs = pairs.map(&:first)
        isobar_groups = pairs.map(&:last)
        Mspire::Spectrum.new([mzs, isobar_groups])
      end

      # For every bin: draws num_rand_samples_per_bin random m/z values from
      # the bin's range, measures each one's distance to the nearest database
      # m/z and fits a probability distribution to those distances.
      #
      # use_ppm uses ppm or amu if false
      # returns the search_bins
      def create_probability_distribution_for_search_bins!(search_bins, db_isobar_spectrum, num_rand_samples_per_bin, use_ppm=true)
        type = use_ppm ? :ppm : :amu
        search_bins.each do |search_bin|
          rng = Random.new
          random_mzs = num_rand_samples_per_bin.times.map { rng.rand(search_bin.to_range) }
          # find the deltas
          diffs = random_mzs.map do |random_mz|
            nearest_random_mz = db_isobar_spectrum.find_nearest(random_mz)
            delta = (random_mz - nearest_random_mz).abs
            use_ppm ? (delta / nearest_random_mz * 1e6) : delta
          end
          search_bin.probability_distribution = ProbabilityDistribution.deviations_to_probability_distribution(type, diffs)
        end
        search_bins
      end

      # Splits the db isobar spectrum into as many consecutive bins as
      # possible while keeping at least min_n_per_bin m/z values per bin
      # (always at least one bin).  Every bin but the last excludes its upper
      # edge (the first m/z of the next group); the last bin is inclusive.
      #
      # Raises RuntimeError when the spectrum holds no m/z values.
      def create_search_bins(db_isobar_spectrum, min_n_per_bin)
        no_data_message = 'I think you need some data in your query spectrum!'
        ss = db_isobar_spectrum.mzs.size
        raise no_data_message if ss == 0
        # the largest number of divisions that still leaves enough per bin
        optimal_num_groups = (1..ss).take_while {|divisions| (ss.to_f / divisions) >= min_n_per_bin }.last || 1
        groups = db_isobar_spectrum.points.in_groups(optimal_num_groups,false).to_a
        raise no_data_message if groups.empty?
        last_group = groups.last
        search_bins = groups.each_cons(2).map do |points1, points2|
          Mspire::Lipid::Search::Bin.new( Range.new(points1.first.first, points2.first.first, true), db_isobar_spectrum )
        end
        last_range = Range.new(last_group.first.first, last_group.last.first) # inclusive
        search_bins << Mspire::Lipid::Search::Bin.new(last_range, db_isobar_spectrum)
      end
    end
  end
end

data/lib/mspire/lipid.rb ADDED Viewed

@@ -0,0 +1,19 @@
module Mspire
  # One lipid record; its attributes are the symbols listed by Lipid.members.
  class Lipid
    # the attribute names, in the positional order #initialize expects them
    def self.members
      [:lm_id,:common_name,:systematic_name,:formula,:mass,:category,:main_class,:sub_class,:pubchem_id,:inchi_key,:kegg_id,:chebi_id,:structure]
    end
    members.each {|mem| attr_accessor mem }

    # Takes the attribute values positionally, in the order of Lipid.members.
    # Missing trailing values are left nil and surplus values are ignored.
    # Each value is stored under the instance variable named after its member
    # so that every accessor (including pubchem_id) reads what was passed in.
    def initialize(*args)
      self.class.members.zip(args).each do |member, value|
        instance_variable_set("@#{member}", value)
      end
    end

    # "<lm_id: formula: mass common_name>" with the common name cut to its
    # first 20 characters (followed by "...") when longer.  A nil common name
    # is shown as an empty string.
    def inspect
      name = common_name.to_s
      cut_common_name = (name.size <= 20) ? name : (name[0,20]+"...")
      "<#{lm_id}: #{formula}: #{mass} #{cut_common_name}>"
    end
  end
end

data/lib/mspire/lipid_maps.rb ADDED Viewed

@@ -0,0 +1,87 @@
+require 'mspire/lipid'
+require 'mspire/mass'
module Mspire
  # Reads LipidMaps tab-separated files into Mspire::Lipid objects.
  module LipidMaps
    DEFAULTS = {
      :high_res_mass => true,
      :rubabel_molecules => false,
      :molecular_formula_objects => true,
    }

    # returns an array of Lipids, one per data line whose formula column
    # contains a capital letter (lines without a formula are skipped).
    #
    # The kind of file is recognized by the number of tab-separated columns
    # in its first (header) line: 8 (programmatic), 20 (download) or 21
    # (download with structure).  Any other width raises ArgumentError.
    #
    # opts (merged over DEFAULTS):
    #
    #     :high_res_mass => true  for programmatic files the mass is
    #         recalculated from the formula; otherwise the mass column is
    #         converted with to_f
    #     :molecular_formula_objects => true  replaces each formula string
    #         with a Mspire::MolecularFormula
    #     :rubabel_molecules => false  when true, requires rubabel and
    #         replaces each structure with a Rubabel::Molecule
    def self.parse_file(lipidmaps_tsv, opts={})
      opts = DEFAULTS.merge(opts)
      require 'rubabel' if opts[:rubabel_molecules]
      lm_ft = nil
      # block form so the file handle is closed even when parsing raises
      lipids = File.open(lipidmaps_tsv) do |io|
        header = io.readline.split("\t")
        lm_ft = filetype_for(header.size)
        index_mapping = index_mapping_for(lm_ft)
        formula_i = index_mapping[Mspire::Lipid.members.index(:formula)]
        io.each_line.map do |line|
          data = line.chomp.split("\t")
          next unless data[formula_i] =~ /[A-Z]/  # <- there is a formula!
          lipid = Mspire::Lipid.new( *index_mapping.map {|i| data[i] } )
          lipid.mass = lipid.mass.to_f
          lipid
        end.compact
      end

      recalculate_mass = opts[:high_res_mass] && lm_ft == :programmatic
      if opts[:molecular_formula_objects] || opts[:rubabel_molecules] || recalculate_mass
        lipids.each do |lipid|
          if opts[:molecular_formula_objects]
            lipid.formula = Mspire::MolecularFormula.new(lipid.formula)
          end
          if recalculate_mass
            lipid.mass = Mspire::Mass.formula_to_exact_mass(lipid.formula)
          end
          if opts[:rubabel_molecules]
            # the structure column holds the sdf text with '|' for newlines
            lipid.structure = Rubabel::Molecule.from_string(lipid.structure.gsub('|', "\n"), :sdf)
          end
        end
      end
      lipids
    end

    # maps the number of header columns to the lipidmaps filetype
    def self.filetype_for(num_columns)
      case num_columns
      when 8 then :programmatic
      when 20 then :download
      when 21 then :download_sd
      else
        raise ArgumentError, "unrecognized lipidmaps file: #{num_columns} header columns (expected 8, 20 or 21)"
      end
    end

    # returns, in the order of Mspire::Lipid.members, the column index each
    # member is read from for the given filetype
    def self.index_mapping_for(lm_ft)
      return (0...(Mspire::Lipid.members.size)).to_a if lm_ft == :programmatic
      indices = {
        :lm_id => 0,
        :systematic_name => 1,
        :category => 3,
        :main_class => 4,
        :mass => 5,
        :formula => 6,
        :pubchem_id => 7,
        :inchi_key => 8,
        :common_name => 11,
        :kegg_id => 12,
        :chebi_id => 13,
        :sub_class => 14,
        :structure => 20,
      }
      Mspire::Lipid.members.map {|key| indices[key] }
    end

    private_class_method :filetype_for, :index_mapping_for
  end
end

data/mspire-lipidomics.gemspec ADDED Viewed

@@ -0,0 +1,85 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
Gem::Specification.new do |s|
  s.name = "mspire-lipidomics"
  s.version = "0.1.4"
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["John T. Prince"]
  s.date = "2012-05-10"
  s.description = "does lipidomics"
  s.email = "jtprince@gmail.com"
  s.executables = ["lipidomic-search.rb"]
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".rspec",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "bin/lipidomic-search.rb",
    "lib/mspire/lipid.rb",
    "lib/mspire/lipid/ion.rb",
    "lib/mspire/lipid/ion/fragment.rb",
    "lib/mspire/lipid/modification.rb",
    "lib/mspire/lipid/search.rb",
    "lib/mspire/lipid/search/bin.rb",
    "lib/mspire/lipid/search/db_isobar_group.rb",
    "lib/mspire/lipid/search/hit.rb",
    "lib/mspire/lipid/search/probability_distribution.rb",
    "lib/mspire/lipid/search/query.rb",
    "lib/mspire/lipid_maps.rb",
    "scratch/OBConversion_methods.txt",
    "scratch/atom_methods.txt",
    "scratch/bond_methods.txt",
    "scratch/mol_methods.txt",
    "scratch/split_molecules.rb",
    "script/find_nearest_lipid.rb",
    "spec/mspire/lipid/ion_spec.rb",
    "spec/mspire/lipid/modification_spec.rb",
    "spec/mspire/lipid/search_spec.rb",
    "spec/mspire/lipid_maps_spec.rb",
    "spec/mspire/lipid_spec.rb",
    "spec/spec_helper.rb",
    "spec/testfiles/lipidmaps_download.tsv",
    "spec/testfiles/lipidmaps_programmatic_short.tsv",
    "spec/testfiles/lipidmaps_sd_download.tsv"
  ]
  s.homepage = "http://github.com/princelab/mspire-lipidomics"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.18"
  s.summary = "mass spectrometry based lipidomics - especially shotgun lipidomics"
  # The rubabel lines below used to read %q<rubabel>= 0.1.0>, which ends the
  # %q literal at the first '>' and does not parse.  The version constraint
  # now lives in the requirement list.
  # NOTE(review): this file is generated; the dependency declaration it is
  # generated from presumably carries the same typo -- fix it there as well.
  if s.respond_to? :specification_version then
    s.specification_version = 3
    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<mspire>, [">= 0.7.8"])
      s.add_development_dependency(%q<rubabel>, [">= 0.1.0"])
      s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
      s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
      s.add_development_dependency(%q<rcov>, [">= 0"])
    else
      s.add_dependency(%q<mspire>, [">= 0.7.8"])
      s.add_dependency(%q<rubabel>, [">= 0.1.0"])
      s.add_dependency(%q<rspec>, ["~> 2.3.0"])
      s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
      s.add_dependency(%q<rcov>, [">= 0"])
    end
  else
    s.add_dependency(%q<mspire>, [">= 0.7.8"])
    s.add_dependency(%q<rubabel>, [">= 0.1.0"])
    s.add_dependency(%q<rspec>, ["~> 2.3.0"])
    s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
    s.add_dependency(%q<rcov>, [">= 0"])
  end
end

data/scratch/OBConversion_methods.txt ADDED Viewed

@@ -0,0 +1,47 @@
+get_in_stream
+get_out_stream
+set_in_stream
+set_out_stream
+set_in_and_out_formats
+set_in_format
+set_out_format
+get_in_format
+get_out_format
+get_in_filename
+get_in_pos
+get_in_len
+get_title
+get_aux_conv
+set_aux_conv
+is_option
+get_options
+add_option
+remove_option
+set_options
+copy_options
+get_supported_input_format
+get_supported_output_format
+convert
+full_convert
+add_chem_object
+get_chem_object
+is_last
+is_first_input
+set_first_input
+get_output_index
+set_output_index
+set_more_files_to_come
+set_one_object_only
+set_last
+is_last_file
+get_count
+write
+write_string
+write_file
+close_out_file
+read
+read_string
+read_file
+open_in_and_out_files
+report_number_converted
+num_input_objects