RubyGems - mspire-lipid - Versions diffs - 0.2.0 - Mend

mspire-lipid 0.2.0

Files changed (38) hide show

checksums.yaml +7 -0
data/.document +5 -0
data/.gitignore +53 -0
data/.rspec +1 -0
data/Gemfile +4 -0
data/LICENSE +21 -0
data/README.md +11 -0
data/Rakefile +24 -0
data/bin/lipidomic-search.rb +203 -0
data/lib/mspire/lipid.rb +19 -0
data/lib/mspire/lipid/ion.rb +71 -0
data/lib/mspire/lipid/ion/fragment.rb +68 -0
data/lib/mspire/lipid/modification.rb +120 -0
data/lib/mspire/lipid/search.rb +205 -0
data/lib/mspire/lipid/search/bin.rb +79 -0
data/lib/mspire/lipid/search/db_isobar_group.rb +20 -0
data/lib/mspire/lipid/search/hit.rb +79 -0
data/lib/mspire/lipid/search/probability_distribution.rb +50 -0
data/lib/mspire/lipid/search/query.rb +23 -0
data/lib/mspire/lipid/version.rb +6 -0
data/lib/mspire/lipid_maps.rb +110 -0
data/mspire-lipid.gemspec +38 -0
data/scratch/OBConversion_methods.txt +47 -0
data/scratch/atom_methods.txt +145 -0
data/scratch/bond_methods.txt +867 -0
data/scratch/mol_methods.txt +183 -0
data/scratch/split_molecules.rb +93 -0
data/script/find_nearest_lipid.rb +134 -0
data/spec/mspire/lipid/ion_spec.rb +96 -0
data/spec/mspire/lipid/modification_spec.rb +70 -0
data/spec/mspire/lipid/search_spec.rb +82 -0
data/spec/mspire/lipid_maps_spec.rb +64 -0
data/spec/mspire/lipid_spec.rb +16 -0
data/spec/spec_helper.rb +13 -0
data/spec/testfiles/lipidmaps_download.tsv +11 -0
data/spec/testfiles/lipidmaps_programmatic_short.tsv +32 -0
data/spec/testfiles/lipidmaps_sd_download.tsv +11 -0
metadata +202 -0

data/lib/mspire/lipid/modification.rb ADDED

@@ -0,0 +1,120 @@
+require 'mspire/mass'
+require 'mspire/molecular_formula'
+module Mspire
+  class Lipid
+    # the convention is all mods are gains unless the name ends in an
+    # underscore
+    class Modification < Mspire::MolecularFormula
+      # calculates the mass diff.  For every positive charge the mass of an
+      # electron is subtracted; for every negative charge the mass of an
+      # electron is added.  If gain is false, then the mass diff will be
+      # negative. Formula may be a string.
+      def self.massdiff(formula, charge, gain=true)
+        massdiff = Mspire::MolecularFormula[formula].mass
+        massdiff -= (charge * Mspire::Mass::ELECTRON) # + charge subtracts, - charge adds
+        massdiff = -massdiff unless gain
+        massdiff
+      end
+      # the charge on the mod should be represented by the number of plusses
+      # or minuses after the formula (Li+ for a +1 charge Lithium or H2++, 2
+      # protons with a total of 2 charges)
+      FORMULAS = {
+        :proton => 'H',
+        :ammonium => 'NH4',
+        :lithium => 'Li',
+        :sodium => 'Na',
+        :water => 'H2O',
+        :ammonia => 'NH3',
+        :carbon_dioxide => 'CO2',
+        :acetate => 'C2H3O2',  # OAc-  # need to work out negative charge
+      }
+      CHARGE = {
+        :proton => 1,
+        :ammonium => 1,
+        :lithium => 1,
+        :sodium=> 1,
+        :water => 0,
+        :ammonia => 0,
+        :carbon_dioxide => 0,
+        :acetate => -1,
+      }
+      # determined by running formulas through Mspire::Mass.massdiff
+      MASSDIFFS = {}
+      FORMULAS.each do |name, formula|
+         MASSDIFFS[name] = self.massdiff(formula, CHARGE[name])
+      end
+      # as a symbol
+      attr_accessor :name
+      # a MolecularFormula object
+      attr_accessor :formula
+      # negative indicates a loss
+      attr_accessor :massdiff
+      # the charge
+      attr_accessor :charge
+      # if no mass or formula is given then it searches the common mods for the name
+      # @param [Symbol] name the name of the mod
+      # A number of opts are expected if they are not found in the FORMULAS,
+      # CHARGE, or MASSDIFFS hashes.  However, the massdiff will be inferred
+      # from the formula if it is not given:
+      #
+      #     attributes:
+      #     :formula = the chemical formula, lipidmaps style ("C2H4BrO") or
+      #                any valid argument to MolecularFormula.from_any
+      #     :massdiff = +/-Float
+      #     :charge = +/- Integer
+      #
+      #     instruction:
+      #     :loss = true   negates the mass diff sign and charge during initialization
+      #                    this option is typically only done for molecules
+      #                    already present in the FORMULAS hash, e.g.:
+      #
+      #     proton_loss = Mspire::Lipid::Modification.new(:proton, :loss => true)
+      #     water_loss = Mspire::Lipid::Modification.new(:water, :loss => true)
+      #
+      def initialize(name, opts={})
+        @name = name
+        @formula =
+          if ( form_string = (opts[:formula] || FORMULAS[name]) )
+            Mspire::MolecularFormula.from_any( form_string )
+          end
+        @massdiff = opts[:massdiff] || MASSDIFFS[name]
+        @charge = opts[:charge] || CHARGE[name]
+        if opts[:loss]
+          @charge = -@charge
+          # necessary if you are using a named molecule and you want its loss
+          # rather than gain (i.e., you want a negative massdiff)
+          @massdiff = -@massdiff
+        end
+      end
+      def charged_formula_string
+        @formula.to_s + @charge.abs.times.map { (@charge > 0) ? '+' : '-' }.join
+      end
+      alias_method :to_s, :charged_formula_string
+      # true when the mass difference is positive (the mod adds mass)
+      def gain?
+        massdiff > 0
+      end
+      # true whenever #gain? is false; note that this includes a mass
+      # difference of exactly zero
+      def loss?
+        !gain?
+      end
+      # short debugging representation built from #to_s (the charged formula)
+      def inspect
+        "<Mod: #{to_s}>"
+      end
+    end
+  end
+end

data/lib/mspire/lipid/search.rb ADDED

@@ -0,0 +1,205 @@
+require 'mspire/spectrum'
+require 'rserve/simpler'  # TODO: move to integrated interface with rserve when available
+require 'core_ext/array/in_groups'
+require 'mspire/lipid/search/hit'
+require 'mspire/lipid/search/bin'
+require 'mspire/lipid/modification'
+require 'mspire/lipid/search/probability_distribution'
+module Mspire
+  class Lipid
+    class Search
+      # modification name => list of copy counts.  generate_simple_queries
+      # uses each count as the number of copies of that modification to put
+      # on a query.
+      STANDARD_MODIFICATIONS = {
+        :proton => [1,2],
+        :ammonium => [1],
+        :lithium => [1],
+        :water => [1,2],
+      }
+      # default search options; the opts given to #initialize are merged
+      # over these, and the opts given to #search are merged over the result
+      STANDARD_SEARCH = {
+        :units => :ppm,
+        :query_min_count_per_bin => 500,  # min number of peaks per bin
+        :num_rand_samples_per_bin => 1000,
+        :num_nearest => 2,
+        :return_order => :as_given,  # or :sorted
+      }
+      # the option hash built in #initialize (STANDARD_SEARCH merged with opts)
+      attr_accessor :options
+      # the lambda returned by create_search_function
+      attr_accessor :search_function
+      # will generate PossibleLipid objects and return a new search object
+      # uses only one kind of loss at a time and one type of gain at a time
+      # will also do the combination of a gain and a loss if gain_and_loss is
+      # true
+      def self.generate_simple_queries(lipids, mods=STANDARD_MODIFICATIONS, gain_and_loss=false)
+        possible_lipids = []
+        real_mods_and_cnts = mods.map {|name, cnts| [Mspire::Lipid::Modification.new(name), cnts] }
+        # one of each
+        real_mods_and_cnts.each do |mod, counts|
+          counts.each do |cnt|
+            possible_lipids << Mspire::Lipid::Search::Query.new(lipid, Array.new(cnt, mod))
+          end
+        end
+        if gain_and_loss
+          # one of each gain + one of each loss
+          (gain_mod_cnt_pairs, loss_mod_cnt_pairs) = real_mods_and_cnts.partition {|mod, count| mod.gain }
+          gain_mod_cnt_pairs.each do |mod, cnt|
+            lipids.each do |lipid|
+              #### need to implement still (use combinations or something...)
+              get_this_working!
+            end
+          end
+        end
+        self.new(possible_lipids)
+      end
+      # ions are Mspire::Lipid::Ion objects
+      # each one should give a non-nil m/z value
+      def initialize(ions=[], opts={})
+        @options = STANDARD_SEARCH.merge(opts)
+        @db_isobar_spectrum = create_db_isobar_spectrum(ions)
+        @search_function = create_search_function(ions, @options)
+      end
+      # returns an array of HitGroups.  A BH-derived q-value is stored on the
+      # top hit of each group (will switch to Storey soon enough).  Depending
+      # on :return_order the HitGroups come back either in the order produced
+      # by the search function or sorted by p-value.
+      # assumes search_queries are in ascending m/z order
+      def search(search_queries, opts={})
+        opt = @options.merge( opts )
+        hit_groups = @search_function.call(search_queries, opt[:num_nearest])
+        sorted_hit_groups = qvalues!(hit_groups, opt)
+        case opts[:return_order]
+        when :given
+          hit_groups
+        when :sorted
+          sorted_hit_groups
+        else
+          raise ArgumentError, "invalid :return_order"
+        end
+      end
+      def qvalues!(hit_groups, opts)
+        # from http://stats.stackexchange.com/questions/870/multiple-hypothesis-testing-correction-with-benjamini-hochberg-p-values-or-q-va
+        # but I've already coded this up before, too, in multiple ways...
+        prev_bh_value = 0
+        num_total_tests = hit_groups.size
+        #hit_groups.each {|hg| p [hg.first.pvalue, hg] }
+        # calculate Q-values BH style for now:
+        # first hit is the best hit in the group
+        pval_hg_index_tuples = hit_groups.each_with_index.map {|hg,i| [hg.pvalue, hg.delta.abs, hg.ppm.abs, i, hg] }
+        if pval_hg_index_tuples.any? {|pair| pair.first.nan? }
+          $stderr.puts "pvalue of NaN!"
+          $stderr.puts ">>> Consider increasing query_min_count_per_bin or setting ppm to false <<<"
+          raise
+        end
+        sorted_pval_index_tuples = pval_hg_index_tuples.sort
+        sorted_pval_index_tuples.each_with_index do |tuple,i|
+          pval = tuple.first
+          bh_value = pval * num_total_tests / (i + 1)
+          # Sometimes this correction can give values greater than 1,
+          # so we set those values at 1
+          bh_value = [bh_value, 1].min
+          # To preserve monotonicity in the values, we take the
+          # maximum of the previous value or this one, so that we
+          # don't yield a value less than the previous.
+          bh_value = [bh_value, prev_bh_value].max
+          prev_bh_value = bh_value
+          tuple.last.first.qvalue = bh_value # give the top hit the q-value
+        end
+        sorted_pval_index_tuples.map(&:last)
+      end
+      def create_search_function(ions, opt)
+        db_isobar_spectrum = create_db_isobar_spectrum(ions)
+        search_bins = create_search_bins(db_isobar_spectrum, opt[:query_min_count_per_bin])
+        create_probability_distribution_for_search_bins!(search_bins, db_isobar_spectrum, opt[:num_rand_samples_per_bin], opt[:ppm])
+        # create the actual search function
+        # returns an array of hit_groups
+        lambda do |search_queries, num_nearest_hits|
+          Bin.bin(search_bins, search_queries, &:mz)
+          search_bins_with_data = search_bins.reject {|bin| bin.data.empty? }
+          hit_groups = search_bins_with_data.map {|bin| bin.queries_to_hit_groups!(opt[:num_nearest]) }.flatten(1)
+        end
+      end
+      #####################################################
+      # Ancillary to create_search_function:
+      #####################################################
+      # returns a DB isobar spectrum where the m/z values are all the m/z
+      # values to search for and the intensities each an array corresponding
+      # to all the lipid ions matching that m/z value
+      def create_db_isobar_spectrum(ions)
+        mzs = [] ; query_groups = []
+        pairs = ions.group_by(&:mz).sort_by(&:first)
+        pairs.each {|mz, ar| mzs << mz ; query_groups << ar }
+        Mspire::Spectrum.new([mzs, query_groups])
+      end
+      # use_ppm uses ppm or amu if false
+      # returns the search_bins
+      def create_probability_distribution_for_search_bins!(search_bins, db_isobar_spectrum, num_rand_samples_per_bin, use_ppm=true)
+        search_bins.each do |search_bin|
+          rng = Random.new
+          random_mzs = num_rand_samples_per_bin.times.map { rng.rand(search_bin.to_range)  }
+          # find the deltas
+          diffs = random_mzs.map do |random_mz|
+            nearest_random_mz = db_isobar_spectrum.find_nearest(random_mz)
+            delta = (random_mz - nearest_random_mz).abs
+            use_ppm ? delta./(nearest_random_mz).*(1e6) : delta
+          end
+          search_bin.probability_distribution = ProbabilityDistribution.deviations_to_probability_distribution((use_ppm ? :ppm : :amu), diffs)
+        end
+        search_bins
+      end
+      # Splits the db isobar spectrum into consecutive m/z ranges and wraps
+      # each range in a Search::Bin that references the whole spectrum.
+      # Returns the array of bins; raises a RuntimeError if no groups result.
+      def create_search_bins(db_isobar_spectrum, min_n_per_bin)
+        # make sure we get the right bin size based on the input
+        # (the largest number of divisions for which the average number of
+        # points per division is still >= min_n_per_bin; never less than 1)
+        ss = db_isobar_spectrum.mzs.size ; optimal_num_groups = 1
+        (1..ss).each do |divisions|
+          if  (ss.to_f / divisions) >= min_n_per_bin
+            optimal_num_groups = divisions
+          else ; break
+          end
+        end
+        # NOTE(review): mz_ranges is never used below
+        mz_ranges = []
+        prev = nil
+        # in_groups is presumably provided by core_ext/array/in_groups
+        # (required at the top of this file) -- confirm its exact semantics
+        groups = db_isobar_spectrum.points.in_groups(optimal_num_groups,false).to_a
+        case groups.size
+        when 0
+          raise 'I think you need some data in your query spectrum!'
+        when 1
+          # a single bin spanning the first element of the first point through
+          # the first element of the last point (presumably the m/z -- confirm)
+          group = groups.first
+          [ Mspire::Lipid::Search::Bin.new( Range.new(group.first.first, group.last.first), db_isobar_spectrum ) ]
+        else
+          # every bin but the last runs from the start of its group up to, but
+          # excluding, the start of the following group
+          search_bins = groups.each_cons(2).map do |points1, points2|
+            bin = Mspire::Lipid::Search::Bin.new( Range.new(points1.first.first, points2.first.first, true), db_isobar_spectrum )
+            # remember the trailing group so the last bin can be built below
+            prev = points2
+            bin
+          end
+          _range = Range.new(prev.first.first, prev.last.first)
+          search_bins << Mspire::Lipid::Search::Bin.new(_range, db_isobar_spectrum) # inclusive
+        end
+      end
+    end
+  end
+end

data/lib/mspire/lipid/search/bin.rb ADDED

@@ -0,0 +1,79 @@
+require 'mspire/bin'
+module Mspire
+  class Lipid
+    class Search
+      # A Search::Bin is a range that holds a reference to the *entire* query
+      # spectrum (stored as db_spectrum; not just the portion covered by the
+      # range) and a ProbabilityDistribution -- the probability that a peak's
+      # delta to the nearest peak is that small by chance.
+      class Bin < Mspire::Bin
+        # the intensity value of the query spectrum should be a query
+        attr_accessor :db_spectrum
+        attr_accessor :probability_distribution
+        def initialize(range_obj, db_spectrum)
+          super(range_obj.begin, range_obj.end, range_obj.exclude_end?)
+          @db_spectrum = db_spectrum
+        end
+        # queues a query in this bin's data; queries_to_hit_groups! later
+        # searches everything queued and empties the queue
+        def <<(query)
+          @data << query
+        end
+        # returns the nearest num_hits Mspire::Lipid::Search::Hits sorted by delta
+        # [with tie going to the lower m/z]
+        # searches all queries and removes them from the data queue
+        def queries_to_hit_groups!(num_hits=1)
+          queries = @data.dup
+          @data.clear
+          @db_isobar_groups_by_index = @db_spectrum.intensities
+          hit_groups = queries.map do |query|
+            best_hits(query, num_hits)
+          end
+          all_top_hits = hit_groups.map(&:first)
+          # updates the pvalues for all the hits
+          pvalues = probability_distribution.pvalues( all_top_hits )
+          all_top_hits.zip(pvalues) {|hit, pvalue| hit.pvalue = pvalue }
+          hit_groups
+        end
+        # returns a HitGroup object
+        def best_hits(query, num_hits)
+          query_mz = query.mz
+          #puts "MZ: #{query_mz}"
+          db_mzs = @db_spectrum.mzs
+          index = @db_spectrum.find_nearest_index(query_mz)
+          _min = index - (num_hits-1)
+          (_min >= 0) || (_min = 0)
+          _max = index + (num_hits-1)
+          (_max < db_mzs.size) || (_max = @db_spectrum - 1)
+          delta_index_pairs = (_min.._max).map {|i| [query_mz.-(db_mzs[i]).abs, i] }
+          closest_delta_index_pairs = delta_index_pairs.sort
+          top_num_hits_delta_index_pairs = closest_delta_index_pairs[0, num_hits]
+          top_num_hit_indices = top_num_hits_delta_index_pairs.map(&:last)
+          hit_group = top_num_hit_indices.map do |index|
+            Hit.new( :db_isobar_group => @db_isobar_groups_by_index[index], :observed_mz => query_mz)
+          end
+          HitGroup.new(hit_group)
+        end
+        def inspect
+          "<(#{super}) @db_spectrum(points size)=#{db_spectrum.mzs.size} @probability_distribution=#{probability_distribution}>"
+        end
+        def to_range
+          Range.new( self.begin, self.end, self.exclude_end? )
+        end
+      end
+    end
+  end
+end

data/lib/mspire/lipid/search/db_isobar_group.rb ADDED

@@ -0,0 +1,20 @@
+module Mspire
+  class Lipid
+    class Search
+      # this is a group of Lipid::Ion objects that all have the same (or
+      # possibly similar) m/z
+      class DBIsobarGroup < Array
+        # it is implemented like this so that the isobar group *could* have
+        # individuals in it with slightly different m/z values and this coudl
+        # still be used as a container.  In my current implementation they
+        # have exactly the same m/z
+        attr_accessor :mz
+        def initialize( ar=[], mz=nil)
+          @mz = mz if mz
+          self.replace(ar)
+        end
+      end
+    end
+  end
+end

data/lib/mspire/lipid/search/hit.rb ADDED

@@ -0,0 +1,79 @@
+module Mspire
+  class Lipid
+    class Search
+      class Hit
+        # the db_isobar_group this hit is associated with.  Each hit is only
+        # associated with a single db_isobar_group!
+        attr_accessor :db_isobar_group
+        # the experimental m/z value
+        attr_accessor :observed_mz
+        # the probability the hit is due to random chance
+        attr_accessor :pvalue
+        # the FDR if the threshold accepts this pvalue.  Note that this value
+        # is relative to the number of tests performed and not completely
+        # intrinsic to the hit itself.
+        attr_accessor :qvalue
+        # qvalue derived from decoy testing
+        attr_accessor :decoy_qvalue
+        # the probability distribution that can be used to determine its
+        # pvalue
+        attr_accessor :probability_distribution
+        def initialize(hash={})
+          hash.each {|k,v| instance_variable_set("@#{k}", v) }
+        end
+        # observed_mz - query m/z
+        def delta
+          @observed_mz - @db_isobar_group.first.mz.to_f
+        end
+        alias_method :amu, :delta
+        # the absolute value of distance from true val
+        def delta_abs
+          delta.abs
+        end
+        # parts per million (divided by theoretical m/z)
+        def ppm
+          (delta / @db_isobar_group.first.mz) * 1e6
+        end
+        def theoretical_mz
+          @db_isobar_group.first.mz
+        end
+        def inspect
+          "<<#{super} -- <ppm=#{ppm} delta=#{delta} theoretical_mz=#{theoretical_mz}>>"
+        end
+      end
+      # A query that matched multiple items.  Each search returns a hit group
+      # which consists of the best hits for that experimental m/z.  When
+      # queried for values like delta or ppm, it will delegate to the first hit.
+      # So, in many ways it can be used as a container for hits, but it puts
+      # its best face forward.
+      class HitGroup < Array
+        # should implement with delegator obviously...
+        # should allow setting ???
+        def delta() first.delta end
+        def ppm() first.ppm end
+        def theoretical_mz() first.theoretical_mz end
+        def query_group() first.query_group end
+        def observed_mz() first.observed_mz end
+        def pvalue() ; first.pvalue end
+        def qvalue() ; first.qvalue end
+        def decoy_qvalue() ; first.decoy_qvalue end
+        def best_hit() first end
+      end
+    end
+  end
+end