mspire-lipid 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,120 @@
1
+ require 'mspire/mass'
2
+ require 'mspire/molecular_formula'
3
+
4
+ module Mspire
5
+ class Lipid
6
+
7
# A named chemical modification (adduct or neutral loss) that can be applied
# to a lipid.  The convention is that all mods are gains unless the name ends
# in an underscore.
class Modification < Mspire::MolecularFormula

  # Calculates the mass diff for a formula carrying the given charge.  For
  # every positive charge the mass of an electron is subtracted; for every
  # negative charge the mass of an electron is added.
  #
  # @param formula the formula (may be a string) handed to
  #   Mspire::MolecularFormula[]
  # @param [Integer] charge the signed charge of the modification
  # @param [Boolean] gain if false, the returned mass diff is negated
  # @return the signed mass difference
  def self.massdiff(formula, charge, gain=true)
    massdiff = Mspire::MolecularFormula[formula].mass
    massdiff -= (charge * Mspire::Mass::ELECTRON) # + charge subtracts, - charge adds
    gain ? massdiff : -massdiff
  end

  # the charge on the mod should be represented by the number of plusses
  # or minuses after the formula (Li+ for a +1 charge Lithium or H2++, 2
  # protons with a total of 2 charges)
  FORMULAS = {
    :proton => 'H',
    :ammonium => 'NH4',
    :lithium => 'Li',
    :sodium => 'Na',
    :water => 'H2O',
    :ammonia => 'NH3',
    :carbon_dioxide => 'CO2',
    :acetate => 'C2H3O2',  # OAc- # need to work out negative charge
  }

  # signed charge carried by each named modification
  CHARGE = {
    :proton => 1,
    :ammonium => 1,
    :lithium => 1,
    :sodium => 1,
    :water => 0,
    :ammonia => 0,
    :carbon_dioxide => 0,
    :acetate => -1,
  }

  # determined by running each formula (with its charge) through
  # Modification.massdiff
  MASSDIFFS = {}
  FORMULAS.each do |name, formula|
    MASSDIFFS[name] = self.massdiff(formula, CHARGE[name])
  end

  # as a symbol
  attr_accessor :name
  # a MolecularFormula object
  attr_accessor :formula
  # negative indicates a loss
  attr_accessor :massdiff
  # the charge
  attr_accessor :charge

  # If no mass or formula is given then it searches the common mods
  # (FORMULAS, CHARGE, MASSDIFFS) for the name.
  #
  # @param [Symbol] name the name of the mod
  # @param [Hash] opts
  #
  # A number of opts are expected if they are not found in the FORMULAS,
  # CHARGE, or MASSDIFFS hashes.  However, the massdiff will be inferred
  # from the formula if it is not given:
  #
  # attributes:
  #     :formula = the chemical formula, lipidmaps style ("C2H4BrO") or
  #                any valid argument to MolecularFormula.from_any
  #     :massdiff = +/-Float
  #     :charge = +/- Integer
  #
  # instruction:
  #     :loss = true   negates the mass diff sign and charge during initialization
  #                    this option is typically only done for molecules
  #                    already present in the FORMULAS hash (e.g.)
  #
  #         proton_loss = Mspire::Lipid::Modification.new(:proton, :loss => true)
  #         water_loss = Mspire::Lipid::Modification.new(:water, :loss => true)
  #
  # @raise [ArgumentError] if :loss is requested but the charge or the
  #   massdiff cannot be determined
  def initialize(name, opts={})
    @name = name
    form_string = opts[:formula] || FORMULAS[name]
    @formula = Mspire::MolecularFormula.from_any(form_string) if form_string
    @charge = opts[:charge] || CHARGE[name]
    @massdiff = opts[:massdiff] || MASSDIFFS[name]

    # Honor the documented contract: infer the massdiff from the formula
    # when it was neither supplied nor known.  Same arithmetic as
    # Modification.massdiff; an unknown charge is treated as neutral for
    # this calculation only.
    # NOTE(review): assumes the object returned by from_any responds to
    # #mass, as the object returned by MolecularFormula[] does -- confirm.
    if @massdiff.nil? && @formula
      @massdiff = @formula.mass - ((@charge || 0) * Mspire::Mass::ELECTRON)
    end

    if opts[:loss]
      if @charge.nil? || @massdiff.nil?
        raise ArgumentError, "cannot take the loss of #{name.inspect}: its charge and massdiff (or formula) must be known"
      end
      @charge = -@charge
      # necessary if you are using a named molecule and you want its loss
      # rather than gain (i.e., you want a negative massdiff)
      @massdiff = -@massdiff
    end
  end

  # @return [String] the formula followed by one '+' or '-' per unit of charge
  def charged_formula_string
    sign = (@charge > 0) ? '+' : '-'
    "#{@formula}#{sign * @charge.abs}"
  end

  alias_method :to_s, :charged_formula_string

  # @return [Boolean] true if the massdiff is positive
  def gain?
    massdiff > 0
  end

  # @return [Boolean] true if the mod is not a gain (massdiff <= 0)
  def loss?
    !gain?
  end

  def inspect
    "<Mod: #{to_s}>"
  end

end
117
+ end
118
+ end
119
+
120
+
@@ -0,0 +1,205 @@
1
+ require 'mspire/spectrum'
2
+ require 'rserve/simpler' # TODO: move to integrated interface with rserve when available
3
+ require 'core_ext/array/in_groups'
4
+ require 'mspire/lipid/search/hit'
5
+ require 'mspire/lipid/search/bin'
6
+ require 'mspire/lipid/modification'
7
+ require 'mspire/lipid/search/probability_distribution'
8
+
9
+ module Mspire
10
+ class Lipid
11
# Searches observed m/z values against a database of lipid ions and attaches
# p-values / q-values to the resulting hits.
class Search
  # modification name => the counts of that modification to generate
  STANDARD_MODIFICATIONS = {
    :proton => [1,2],
    :ammonium => [1],
    :lithium => [1],
    :water => [1,2],
  }
  STANDARD_SEARCH = {
    :units => :ppm,                   # or :amu
    :query_min_count_per_bin => 500,  # min number of peaks per bin
    :num_rand_samples_per_bin => 1000,
    :num_nearest => 2,
    :return_order => :as_given,       # or :sorted
  }

  attr_accessor :options
  attr_accessor :search_function

  # Generates one query per lipid for each requested count of each
  # modification (only one kind of modification per query) and returns a new
  # search object built from those queries.
  #
  # @param lipids an enumerable of lipid objects
  # @param [Hash] mods modification name => array of counts
  # @param [Boolean] gain_and_loss combining a gain with a loss is not
  #   implemented yet
  # @raise [NotImplementedError] if gain_and_loss is true
  def self.generate_simple_queries(lipids, mods=STANDARD_MODIFICATIONS, gain_and_loss=false)
    if gain_and_loss
      #### need to implement still (use combinations or something...)
      raise NotImplementedError, "combining a gain with a loss is not implemented yet"
    end
    real_mods_and_cnts = mods.map {|name, cnts| [Mspire::Lipid::Modification.new(name), cnts] }
    possible_lipids = []
    # one query per (modification, count, lipid)
    real_mods_and_cnts.each do |mod, counts|
      counts.each do |cnt|
        lipids.each do |lipid|
          possible_lipids << Mspire::Lipid::Search::Query.new(lipid, Array.new(cnt, mod))
        end
      end
    end
    self.new(possible_lipids)
  end

  # ions are Mspire::Lipid::Ion objects
  # each one should give a non-nil m/z value
  #
  # @param ions the database ions to search against
  # @param [Hash] opts overrides for STANDARD_SEARCH
  def initialize(ions=[], opts={})
    @options = STANDARD_SEARCH.merge(opts)
    @db_isobar_spectrum = create_db_isobar_spectrum(ions)
    @search_function = create_search_function(ions, @options)
  end

  # Runs the search function over the queries and assigns BH derived
  # q-values to the top hit of each group (will switch to Storey soon
  # enough).  Assumes search_queries are in ascending m/z order.
  #
  # @param search_queries the observed peaks to search
  # @param [Hash] opts per-call overrides of the search options
  # @return [Array] the hit groups, either in the order produced by the
  #   search function (:return_order => :as_given, the default; :given is
  #   accepted as a synonym) or sorted by p-value (:sorted)
  # @raise [ArgumentError] on any other :return_order
  def search(search_queries, opts={})
    opt = @options.merge( opts )
    hit_groups = @search_function.call(search_queries, opt[:num_nearest])
    sorted_hit_groups = qvalues!(hit_groups, opt)
    # consult the merged options so the default return order is honored
    case opt[:return_order]
    when :as_given, :given
      hit_groups
    when :sorted
      sorted_hit_groups
    else
      raise ArgumentError, "invalid :return_order"
    end
  end

  # Calculates Q-values BH style (for now) and sets the q-value on the top
  # hit of every hit group.
  # from http://stats.stackexchange.com/questions/870/multiple-hypothesis-testing-correction-with-benjamini-hochberg-p-values-or-q-va
  #
  # @param hit_groups objects responding to pvalue, delta, ppm and first
  # @param opts currently unused
  # @return [Array] the hit groups sorted by p-value (ties broken by
  #   absolute delta, absolute ppm, then original position)
  # @raise [RuntimeError] if any p-value is NaN
  def qvalues!(hit_groups, opts)
    num_total_tests = hit_groups.size

    # the original index keeps the sort from ever comparing the hit groups
    # themselves
    pval_hg_index_tuples = hit_groups.each_with_index.map {|hg,i| [hg.pvalue, hg.delta.abs, hg.ppm.abs, i, hg] }

    if pval_hg_index_tuples.any? {|tuple| tuple.first.nan? }
      raise "pvalue of NaN! >>> Consider increasing query_min_count_per_bin or setting :units to :amu (or :ppm to false) <<<"
    end

    sorted_pval_index_tuples = pval_hg_index_tuples.sort

    prev_bh_value = 0
    sorted_pval_index_tuples.each_with_index do |tuple,i|
      # Sometimes this correction can give values greater than 1,
      # so we set those values at 1
      bh_value = [tuple.first * num_total_tests / (i + 1), 1].min

      # To preserve monotonicity in the values, we take the
      # maximum of the previous value or this one, so that we
      # don't yield a value less than the previous.
      bh_value = [bh_value, prev_bh_value].max
      prev_bh_value = bh_value
      tuple.last.first.qvalue = bh_value # give the top hit the q-value
    end

    sorted_pval_index_tuples.map(&:last)
  end

  # Builds the search bins (with their probability distributions) for the
  # given ions and returns the actual search function.
  #
  # Deviations are measured in ppm unless opt[:units] is :amu; an explicit
  # opt[:ppm] (true/false), when present, takes precedence.
  #
  # @return [Proc] lambda(search_queries, num_nearest_hits) returning an
  #   array of hit groups
  def create_search_function(ions, opt)
    db_isobar_spectrum = create_db_isobar_spectrum(ions)

    search_bins = create_search_bins(db_isobar_spectrum, opt[:query_min_count_per_bin])

    use_ppm = opt.key?(:ppm) ? opt[:ppm] : (opt[:units] != :amu)
    create_probability_distribution_for_search_bins!(search_bins, db_isobar_spectrum, opt[:num_rand_samples_per_bin], use_ppm)

    # create the actual search function
    # returns an array of hit_groups
    lambda do |search_queries, num_nearest_hits|
      num_nearest_hits ||= opt[:num_nearest]
      Bin.bin(search_bins, search_queries, &:mz)
      search_bins_with_data = search_bins.reject {|bin| bin.data.empty? }
      search_bins_with_data.flat_map {|bin| bin.queries_to_hit_groups!(num_nearest_hits) }
    end
  end

  #####################################################
  # Ancillary to create_search_function:
  #####################################################

  # returns a DB isobar spectrum where the m/z values are all the m/z
  # values to search for and the intensities each an array corresponding
  # to all the lipid ions matching that m/z value
  def create_db_isobar_spectrum(ions)
    pairs = ions.group_by(&:mz).sort_by(&:first)
    mzs = pairs.map(&:first)
    query_groups = pairs.map(&:last)
    Mspire::Spectrum.new([mzs, query_groups])
  end

  # Gives every search bin a probability distribution built from the
  # deviation between random m/z values drawn from the bin's range and
  # their nearest database m/z.
  #
  # use_ppm uses ppm or amu if false
  # returns the search_bins
  def create_probability_distribution_for_search_bins!(search_bins, db_isobar_spectrum, num_rand_samples_per_bin, use_ppm=true)
    deviation_type = use_ppm ? :ppm : :amu
    search_bins.each do |search_bin|
      rng = Random.new
      mz_range = search_bin.to_range
      # find the deltas
      diffs = Array.new(num_rand_samples_per_bin) do
        random_mz = rng.rand(mz_range)
        nearest_random_mz = db_isobar_spectrum.find_nearest(random_mz)
        delta = (random_mz - nearest_random_mz).abs
        use_ppm ? (delta / nearest_random_mz) * 1e6 : delta
      end
      search_bin.probability_distribution = ProbabilityDistribution.deviations_to_probability_distribution(deviation_type, diffs)
    end
    search_bins
  end

  # Splits the db isobar spectrum into the largest number of groups that
  # still leaves at least min_n_per_bin points per group and wraps each
  # group's m/z range in a Search::Bin.
  #
  # @return [Array] the search bins; all but the last exclude their end
  # @raise [RuntimeError] if the spectrum has no points
  def create_search_bins(db_isobar_spectrum, min_n_per_bin)
    ss = db_isobar_spectrum.mzs.size
    raise 'I think you need some data in your query spectrum!' if ss == 0

    # make sure we get the right bin size based on the input
    optimal_num_groups = (1..ss).take_while {|divisions| (ss.to_f / divisions) >= min_n_per_bin }.last || 1

    groups = db_isobar_spectrum.points.in_groups(optimal_num_groups,false).to_a

    case groups.size
    when 0
      raise 'I think you need some data in your query spectrum!'
    when 1
      group = groups.first
      [ Mspire::Lipid::Search::Bin.new( Range.new(group.first.first, group.last.first), db_isobar_spectrum ) ]
    else
      # each bin runs from the first m/z of its group up to (but excluding)
      # the first m/z of the following group
      search_bins = groups.each_cons(2).map do |points1, points2|
        Mspire::Lipid::Search::Bin.new( Range.new(points1.first.first, points2.first.first, true), db_isobar_spectrum )
      end
      last_group = groups.last
      _range = Range.new(last_group.first.first, last_group.last.first)
      search_bins << Mspire::Lipid::Search::Bin.new(_range, db_isobar_spectrum) # inclusive
    end
  end
end
200
+ end
201
+ end
202
+
203
+
204
+
205
+
@@ -0,0 +1,79 @@
1
+ require 'mspire/bin'
2
+
3
+ module Mspire
4
+ class Lipid
5
+ class Search
6
+
7
# A Search::Bin is a range that also holds the *entire* query (db) spectrum
# (not just the portion covered by the range) and a ProbabilityDistribution
# -- the probability that a peak's delta to nearest peak is that small by
# chance.
class Bin < Mspire::Bin
  # the intensity value of the query spectrum should be a query
  attr_accessor :db_spectrum
  attr_accessor :probability_distribution

  # @param [Range] range_obj the m/z range covered by this bin
  # @param db_spectrum the whole db isobar spectrum
  def initialize(range_obj, db_spectrum)
    super(range_obj.begin, range_obj.end, range_obj.exclude_end?)
    @db_spectrum = db_spectrum
  end

  # queues a query in this bin
  # NOTE(review): @data is presumably provided by Mspire::Bin -- confirm
  def <<(query)
    @data << query
  end

  # returns the nearest num_hits Mspire::Lipid::Search::Hits sorted by delta
  # [with tie going to the lower m/z]
  # searches all queries and removes them from the data queue
  #
  # @return [Array] one hit group per queued query; the top hit of each
  #   group has its pvalue set from the bin's probability distribution
  def queries_to_hit_groups!(num_hits=1)
    queries = @data.dup
    @data.clear

    @db_isobar_groups_by_index = @db_spectrum.intensities

    hit_groups = queries.map {|query| best_hits(query, num_hits) }

    all_top_hits = hit_groups.map(&:first)

    # updates the pvalues for all the top hits
    pvalues = probability_distribution.pvalues( all_top_hits )
    all_top_hits.zip(pvalues) {|hit, pvalue| hit.pvalue = pvalue }

    hit_groups
  end

  # Finds the db entries closest to the query's m/z, looking at most
  # num_hits - 1 entries to either side of the nearest db m/z.
  #
  # returns a HitGroup object
  def best_hits(query, num_hits)
    query_mz = query.mz
    db_mzs = @db_spectrum.mzs
    # fall back to the spectrum when called outside queries_to_hit_groups!
    isobar_groups = @db_isobar_groups_by_index || @db_spectrum.intensities
    nearest_index = @db_spectrum.find_nearest_index(query_mz)
    reach = num_hits - 1
    # clamp the candidate window to the valid index range of the db m/z values
    _min = [nearest_index - reach, 0].max
    _max = [nearest_index + reach, db_mzs.size - 1].min
    delta_index_pairs = (_min.._max).map {|i| [(query_mz - db_mzs[i]).abs, i] }
    # sorting [delta, index] pairs sends ties to the lower index (lower m/z)
    top_num_hit_indices = delta_index_pairs.sort[0, num_hits].map(&:last)
    hits = top_num_hit_indices.map do |db_index|
      Hit.new( :db_isobar_group => isobar_groups[db_index], :observed_mz => query_mz)
    end
    HitGroup.new(hits)
  end

  def inspect
    "<(#{super}) @db_spectrum(points size)=#{db_spectrum.mzs.size} @probability_distribution=#{probability_distribution}>"
  end

  # @return [Range] the m/z range covered by this bin
  def to_range
    Range.new( self.begin, self.end, self.exclude_end? )
  end
end
75
+ end
76
+ end
77
+ end
78
+
79
+
@@ -0,0 +1,20 @@
1
+
2
+ module Mspire
3
+ class Lipid
4
+ class Search
5
# A group of Lipid::Ion objects that all share the same (or possibly a
# similar) m/z.
class DBIsobarGroup < Array
  # The m/z is stored on the group itself so that the group *could* hold
  # individuals with slightly different m/z values and still be used as a
  # container.  In the current implementation they have exactly the same
  # m/z.
  attr_accessor :mz

  # @param [Array] ar the members of the group
  # @param mz the m/z shared by the group (left unset when nil or false)
  def initialize(ar = [], mz = nil)
    replace(ar)
    @mz = mz if mz
  end
end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,79 @@
1
+
2
+ module Mspire
3
+ class Lipid
4
+ class Search
5
# A single match between an observed m/z and one db isobar group.
class Hit
  # the db_isobar_group this hit is associated with.  Each hit is only
  # associated with a single db_isobar_group!
  attr_accessor :db_isobar_group
  # the experimental m/z value
  attr_accessor :observed_mz
  # the probability the hit is due to random chance
  attr_accessor :pvalue
  # the FDR if the threshold accepts this pvalue.  Note that this value
  # is relative to the number of tests performed and not completely
  # intrinsic to the hit itself.
  attr_accessor :qvalue
  # qvalue derived from decoy testing
  attr_accessor :decoy_qvalue
  # the probability distribution that can be used to determine its pvalue
  attr_accessor :probability_distribution

  # @param [Hash] hash attribute name => value; every pair is stored as the
  #   instance variable of that name
  def initialize(hash={})
    hash.each_pair do |attribute, value|
      instance_variable_set("@#{attribute}", value)
    end
  end

  # the m/z of the first member of the db isobar group
  def theoretical_mz
    @db_isobar_group.first.mz
  end

  # observed_mz - theoretical m/z (signed)
  def delta
    @observed_mz - theoretical_mz.to_f
  end

  alias_method :amu, :delta

  # the absolute value of distance from true val
  def delta_abs
    delta.abs
  end

  # parts per million (divided by theoretical m/z)
  def ppm
    (delta / theoretical_mz) * 1e6
  end

  def inspect
    default_inspect = super
    "<<#{default_inspect} -- <ppm=#{ppm} delta=#{delta} theoretical_mz=#{theoretical_mz}>>"
  end
end
54
+
55
# A query that matched multiple items.  Each search returns a hit group
# which consists of the best hits for that experimental m/z.  When
# queried for values like delta or ppm, it will delegate to the first hit.
# So, in many ways it can be used as a container for hits, but it puts
# its best face forward.
class HitGroup < Array

  # should implement with delegator obviously...
  # should allow setting ???

  def delta() first.delta end
  def ppm() first.ppm end
  def theoretical_mz() first.theoretical_mz end
  def db_isobar_group() first.db_isobar_group end
  def observed_mz() first.observed_mz end
  def pvalue() first.pvalue end
  def qvalue() first.qvalue end
  def decoy_qvalue() first.decoy_qvalue end

  # The group of db entries the best hit matched.  Hits that do not define
  # #query_group themselves (Search::Hit only exposes #db_isobar_group)
  # answer with their db_isobar_group instead of raising NoMethodError.
  def query_group
    top_hit = first
    top_hit.respond_to?(:query_group) ? top_hit.query_group : top_hit.db_isobar_group
  end

  # the first (best) hit of the group
  def best_hit() first end

end
77
+ end
78
+ end
79
+ end