mspire-lipid 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,120 @@
1
+ require 'mspire/mass'
2
+ require 'mspire/molecular_formula'
3
+
4
+ module Mspire
5
+ class Lipid
6
+
7
+ # the convention is all mods are gains unless the name ends in an
8
+ # underscore
9
+ class Modification < Mspire::MolecularFormula
10
+
11
+ # calculates the mass diff. For every positive charge the mass of an
12
+ # electron is subtracted; for every negative charge the mass of an
13
+ # electron is added. If gain is false, then the mass diff will be
14
+ # negative. Formula may be a string.
15
# Computes the signed mass difference contributed by +formula+ at +charge+.
#
# @param formula anything accepted by Mspire::MolecularFormula[] (e.g. a String)
# @param charge [Integer] net charge; one electron mass is removed per
#   positive charge and added per negative charge
# @param gain [Boolean] when false the result is negated (i.e. a loss)
# @return [Numeric] the mass difference
def self.massdiff(formula, charge, gain=true)
  neutral_mass = Mspire::MolecularFormula[formula].mass
  electron_correction = charge * Mspire::Mass::ELECTRON
  signed_mass = neutral_mass - electron_correction
  gain ? signed_mass : -signed_mass
end
21
+
22
+ # the charge on the mod should be represented by the number of plusses
23
+ # or minuses after the formula (Li+ for a +1 charge Lithium or H2++, 2
24
+ # protons with a total of 2 charges)
25
# Chemical formula of each commonly used modification, keyed by name.
# The net charge of each one is kept separately in CHARGE.
FORMULAS = {
  proton: 'H',
  ammonium: 'NH4',
  lithium: 'Li',
  sodium: 'Na',
  water: 'H2O',
  ammonia: 'NH3',
  carbon_dioxide: 'CO2',
  acetate: 'C2H3O2', # OAc- # need to work out negative charge
}

# Net charge carried by each named modification.
CHARGE = {
  proton: 1,
  ammonium: 1,
  lithium: 1,
  sodium: 1,
  water: 0,
  ammonia: 0,
  carbon_dioxide: 0,
  acetate: -1,
}

# Mass difference (as a gain) of each named modification, computed by
# running its formula and charge through Modification.massdiff.
MASSDIFFS = FORMULAS.each_with_object({}) do |(mod_name, mod_formula), table|
  table[mod_name] = self.massdiff(mod_formula, CHARGE[mod_name])
end
51
+
52
+ # as a symbol
53
+ attr_accessor :name
54
+ # a MolecularFormula object
55
+ attr_accessor :formula
56
+ # negative indicates a loss
57
+ attr_accessor :massdiff
58
+ # the charge
59
+ attr_accessor :charge
60
+
61
+ # if no mass or formula is given then it searches the common mods for the name
62
+ # @param [Symbol] name the name of the mod
63
+ # A number of opts are expected if they are not found in the FORMULAS,
64
+ # CHARGE, or MASSDIFFS hashes. However, the massdiff will be inferred
65
+ # from the formula if it is not given:
66
+ #
67
+ # attributes:
68
+ # :formula = the chemical formula, lipidmaps style ("C2H4BrO") or
69
+ # any valid argument to MolecularFormula.from_any
70
+ # :massdiff = +/-Float
71
+ # :charge = +/- Integer
72
+ #
73
+ # instruction:
74
+ # :loss = true negates the mass diff sign and charge during initialization
75
+ # this option is typically only done for molecules
76
+ # already present in the FORMULA hash (e.g.)
77
+ #
78
+ # proton_loss = Mspire::Lipid::Modification.new(:proton, :loss => true)
79
+ # water_loss = Mspire::Lipid::Modification.new(:water, :loss => true)
80
+ #
81
# Builds a modification. Anything not supplied in +opts+ is looked up by
# +name+ in the FORMULAS, CHARGE and MASSDIFFS tables.
#
# @param name [Symbol] the name of the modification
# @param opts [Hash] optional attributes:
#   :formula  -- any valid argument to Mspire::MolecularFormula.from_any
#   :massdiff -- +/- Float; inferred from the formula and charge when absent
#   :charge   -- +/- Integer; defaults to 0 when neither given nor tabulated
#   :loss     -- when true, negates both the charge and the mass difference
def initialize(name, opts={})
  @name = name
  form_string = opts[:formula] || FORMULAS[name]
  @formula = form_string ? Mspire::MolecularFormula.from_any(form_string) : nil

  # an unknown modification with no explicit charge is treated as neutral
  # so that :loss and the string rendering do not fail on nil
  @charge = opts[:charge] || CHARGE[name] || 0

  @massdiff = opts[:massdiff] || MASSDIFFS[name]
  if @massdiff.nil? && @formula
    # same arithmetic as Modification.massdiff: each positive charge
    # removes one electron mass, each negative charge adds one
    @massdiff = @formula.mass - (@charge * Mspire::Mass::ELECTRON)
  end

  if opts[:loss]
    @charge = -@charge
    # necessary when using a named molecule whose loss (rather than gain)
    # is wanted, i.e. a negative massdiff
    @massdiff = -@massdiff if @massdiff
  end
end
97
+
98
# Renders the formula followed by one '+' per positive charge or one '-'
# per negative charge (e.g. "Li+"); nothing is appended for charge 0.
def charged_formula_string
  formula_text = @formula.to_s
  sign_count = @charge.abs
  sign = (@charge > 0) ? '+' : '-'
  formula_text + (sign * sign_count)
end
101
+
102
+ alias_method :to_s, :charged_formula_string
103
+
104
# True when the modification adds mass (strictly positive massdiff).
def gain?
  massdiff > 0
end

# True whenever the modification is not a gain; note that a massdiff of
# exactly zero therefore counts as a loss.
def loss?
  gain? ? false : true
end
111
+
112
# Short debugging form built from the charged formula string (to_s).
def inspect
  "<Mod: #{self}>"
end
115
+
116
+ end
117
+ end
118
+ end
119
+
120
+
@@ -0,0 +1,205 @@
1
+ require 'mspire/spectrum'
2
+ require 'rserve/simpler' # TODO: move to integrated interface with rserve when available
3
+ require 'core_ext/array/in_groups'
4
+ require 'mspire/lipid/search/hit'
5
+ require 'mspire/lipid/search/bin'
6
+ require 'mspire/lipid/modification'
7
+ require 'mspire/lipid/search/probability_distribution'
8
+
9
+ module Mspire
10
+ class Lipid
11
+ class Search
12
# Modifications applied by default, each mapped to the numbers of copies
# of that modification to try.
STANDARD_MODIFICATIONS = {
  proton: [1, 2],
  ammonium: [1],
  lithium: [1],
  water: [1, 2],
}

# Default search options; callers may override any of them.
STANDARD_SEARCH = {
  units: :ppm,
  query_min_count_per_bin: 500, # min number of peaks per bin
  num_rand_samples_per_bin: 1000,
  num_nearest: 2,
  return_order: :as_given, # or :sorted
}
25
+
26
+ attr_accessor :options
27
+ attr_accessor :search_function
28
+
29
+ # will generate PossibleLipid objects and return a new search object
30
+ # uses only one kind of loss at a time and one type of gain at a time
31
+ # will also do the combination of a gain and a loss if gain_and_loss is
32
+ # true
33
# Builds one Search::Query for every (lipid, modification, count)
# combination and returns a new search object built from those queries.
# Only one kind of modification is applied per query.
#
# @param lipids [Enumerable] the lipids to derive queries from
# @param mods [Hash{Symbol=>Array<Integer>}] modification name => the
#   numbers of copies of that modification to apply
# @param gain_and_loss [Boolean] request combined gain + loss queries;
#   this is not implemented yet
# @return [Search] a search built from the generated queries
# @raise [NotImplementedError] if gain_and_loss is true
def self.generate_simple_queries(lipids, mods=STANDARD_MODIFICATIONS, gain_and_loss=false)
  if gain_and_loss
    # fail up front with a clear message rather than part-way through
    raise NotImplementedError, "combining a gain with a loss is not implemented yet"
  end

  mods_and_counts = mods.map {|name, counts| [Mspire::Lipid::Modification.new(name), counts] }

  queries = []
  lipids.each do |lipid|
    mods_and_counts.each do |mod, counts|
      counts.each do |count|
        # NOTE(review): Search::Query is not defined in the visible source;
        # presumably it is loaded elsewhere -- confirm
        queries << Mspire::Lipid::Search::Query.new(lipid, Array.new(count, mod))
      end
    end
  end
  self.new(queries)
end
54
+
55
+ # ions are Mspire::Lipid::Ion objects
56
+ # each one should give a non-nil m/z value
57
# Prepares a search over the given database ions.
#
# @param ions [Array] Mspire::Lipid::Ion objects, each of which should
#   give a non-nil m/z value
# @param opts [Hash] overrides for the STANDARD_SEARCH defaults
def initialize(ions=[], opts={})
  merged_options = STANDARD_SEARCH.merge(opts)
  @options = merged_options
  @db_isobar_spectrum = create_db_isobar_spectrum(ions)
  @search_function = create_search_function(ions, merged_options)
end
62
+
63
+ # returns an array of HitGroup and a parallel array of BH derived
64
+ # q-values (will switch to Storey soon enough). The HitGroups are
65
+ # returned in the order in which the mz_values are given.
66
+ # assumes search_queries are in ascending m/z order
67
# Searches the database for the given queries and assigns q-values to
# the top hit of each resulting hit group.
#
# @param search_queries [Array] the queries; assumed to be in ascending
#   m/z order
# @param opts [Hash] per-call overrides of the options given at
#   construction (e.g. :num_nearest, :return_order)
# @return [Array] the hit groups, either in the order the queries were
#   given (:return_order => :as_given, also accepted as :given) or
#   sorted by p-value (:return_order => :sorted)
# @raise [ArgumentError] if :return_order is none of the above
def search(search_queries, opts={})
  opt = @options.merge(opts)
  hit_groups = @search_function.call(search_queries, opt[:num_nearest])
  # always run, whatever the return order: this is what sets the q-values
  sorted_hit_groups = qvalues!(hit_groups, opt)
  # consult the merged options so that the default return order applies
  case opt[:return_order]
  when :as_given, :given
    hit_groups
  when :sorted
    sorted_hit_groups
  else
    raise ArgumentError, "invalid :return_order"
  end
end
80
+
81
# Assigns Benjamini-Hochberg style q-values to the top hit of every hit
# group (mutating those hits) and returns the groups sorted by p-value,
# with ties broken by absolute delta, absolute ppm, then original position.
#
# @param hit_groups [Array] groups responding to pvalue, delta, ppm and first
# @param opts [Hash] the search options (currently unused here)
# @return [Array] the hit groups in ascending p-value order
# @raise [RuntimeError] if any p-value is NaN
def qvalues!(hit_groups, opts)
  num_total_tests = hit_groups.size

  # the original index keeps the sort from ever comparing two hit groups
  tuples = hit_groups.each_with_index.map {|hg, i| [hg.pvalue, hg.delta.abs, hg.ppm.abs, i, hg] }

  if tuples.any? {|tuple| tuple.first.nan? }
    advice = ">>> Consider increasing query_min_count_per_bin or setting ppm to false <<<"
    $stderr.puts "pvalue of NaN!"
    $stderr.puts advice
    # carry the diagnosis in the exception itself, not only on stderr
    raise "pvalue of NaN! " + advice
  end

  sorted_tuples = tuples.sort

  prev_bh_value = 0
  sorted_tuples.each_with_index do |tuple, rank|
    bh_value = tuple.first * num_total_tests / (rank + 1)
    # the correction can exceed 1, so cap it there
    bh_value = [bh_value, 1].min
    # preserve monotonicity: never yield less than the previous value
    bh_value = [bh_value, prev_bh_value].max
    prev_bh_value = bh_value
    tuple.last.first.qvalue = bh_value # the top hit of the group gets the q-value
  end

  sorted_tuples.map(&:last)
end
119
+
120
# Builds the search bins and their probability distributions for the
# given ions and returns the lambda that performs a search.
#
# Deviations are measured in ppm when opt[:ppm] is truthy or, if no :ppm
# key is present, when opt[:units] is :ppm; otherwise in amu.
#
# @param ions [Array] the database ions
# @param opt [Hash] search options (:query_min_count_per_bin,
#   :num_rand_samples_per_bin, :num_nearest, :units and/or :ppm)
# @return [Proc] lambda(search_queries, num_nearest_hits) returning an
#   array of hit groups
def create_search_function(ions, opt)
  db_isobar_spectrum = create_db_isobar_spectrum(ions)
  search_bins = create_search_bins(db_isobar_spectrum, opt[:query_min_count_per_bin])

  # an explicit :ppm option wins; otherwise derive it from :units
  use_ppm = opt.key?(:ppm) ? opt[:ppm] : (opt[:units] == :ppm)
  create_probability_distribution_for_search_bins!(search_bins, db_isobar_spectrum, opt[:num_rand_samples_per_bin], use_ppm)

  lambda do |search_queries, num_nearest_hits|
    Bin.bin(search_bins, search_queries, &:mz)
    bins_with_data = search_bins.reject {|bin| bin.data.empty? }
    # honor the per-call value, falling back to the construction-time one
    nearest = num_nearest_hits || opt[:num_nearest]
    bins_with_data.map {|bin| bin.queries_to_hit_groups!(nearest) }.flatten(1)
  end
end
136
+
137
+ #####################################################
138
+ # Ancillary to create_search_function:
139
+ #####################################################
140
+
141
+ # returns a DB isobar spectrum where the m/z values are all the m/z
142
+ # values to search for and the intensities each an array corresponding
143
+ # to all the lipid ions matching that m/z value
144
# Groups the ions by m/z and wraps the result in an Mspire::Spectrum whose
# first array holds the distinct m/z values in ascending order and whose
# second array holds, for each m/z, the ions sharing it.
def create_db_isobar_spectrum(ions)
  mz_and_ions = ions.group_by(&:mz).sort_by(&:first)
  distinct_mzs = mz_and_ions.map {|mz, _group| mz }
  isobar_groups = mz_and_ions.map {|_mz, group| group }
  Mspire::Spectrum.new([distinct_mzs, isobar_groups])
end
150
+
151
+ # use_ppm uses ppm or amu if false
152
+ # returns the search_bins
153
# For every search bin, samples random m/z values inside the bin's range,
# measures how far each lies from the nearest database m/z, and stores
# the resulting probability distribution on the bin.
#
# @param use_ppm deviations are expressed in ppm when truthy, amu otherwise
# @return the search_bins that were passed in
def create_probability_distribution_for_search_bins!(search_bins, db_isobar_spectrum, num_rand_samples_per_bin, use_ppm=true)
  unit = use_ppm ? :ppm : :amu
  search_bins.each do |search_bin|
    rng = Random.new
    samples = num_rand_samples_per_bin.times.map { rng.rand(search_bin.to_range) }
    deviations = samples.map do |sample_mz|
      nearest_mz = db_isobar_spectrum.find_nearest(sample_mz)
      delta = (sample_mz - nearest_mz).abs
      if use_ppm
        (delta / nearest_mz) * 1e6
      else
        delta
      end
    end
    search_bin.probability_distribution = ProbabilityDistribution.deviations_to_probability_distribution(unit, deviations)
  end
  search_bins
end
167
+
168
# Splits the database spectrum into consecutive m/z bins, using as many
# bins as possible while keeping at least min_n_per_bin points per bin
# (always at least one bin).
#
# Every bin but the last covers [first m/z of its group, first m/z of the
# next group); the last bin is inclusive of the final m/z.
#
# @param db_isobar_spectrum the spectrum; must respond to mzs and points
# @param min_n_per_bin [Numeric] minimum number of points wanted per bin
# @return [Array<Mspire::Lipid::Search::Bin>]
# @raise [RuntimeError] if the spectrum holds no points
def create_search_bins(db_isobar_spectrum, min_n_per_bin)
  no_data_message = 'I think you need some data in your query spectrum!'
  num_points = db_isobar_spectrum.mzs.size
  # check before grouping so that an empty spectrum gets this message
  raise no_data_message if num_points == 0

  # largest number of divisions that still leaves enough points per bin
  optimal_num_groups = 1
  (1..num_points).each do |divisions|
    break unless (num_points.to_f / divisions) >= min_n_per_bin
    optimal_num_groups = divisions
  end

  groups = db_isobar_spectrum.points.in_groups(optimal_num_groups, false).to_a
  raise no_data_message if groups.empty?

  bin_class = Mspire::Lipid::Search::Bin
  # each point is presumably an [m/z, value] pair, so point.first is its m/z
  search_bins = groups.each_cons(2).map do |group, next_group|
    bin_class.new(Range.new(group.first.first, next_group.first.first, true), db_isobar_spectrum)
  end

  final_group = groups.last
  search_bins << bin_class.new(Range.new(final_group.first.first, final_group.last.first), db_isobar_spectrum) # inclusive
end
199
+ end
200
+ end
201
+ end
202
+
203
+
204
+
205
+
@@ -0,0 +1,79 @@
1
+ require 'mspire/bin'
2
+
3
+ module Mspire
4
+ class Lipid
5
+ class Search
6
+
7
+ # A Search::Bin is a range that also carries the *entire* query spectrum
8
+ # (not just the portion covered by the range) and a
9
+ # ProbabilityDistribution -- the probability that a peak's delta to its
10
+ # nearest peak is that small by chance.
11
+ class Bin < Mspire::Bin
12
+ # the intensity value of the query spectrum should be a query
13
+ attr_accessor :db_spectrum
14
+ attr_accessor :probability_distribution
15
+
16
# @param range_obj [Range] the m/z range this bin covers; its bounds and
#   end-exclusivity are handed to the superclass
# @param db_spectrum the spectrum that hits are looked up in
def initialize(range_obj, db_spectrum)
  lower, upper, open_ended = range_obj.begin, range_obj.end, range_obj.exclude_end?
  super(lower, upper, open_ended)
  @db_spectrum = db_spectrum
end
20
+
21
# Queues a query in this bin's data.
def <<(query)
  queue = @data
  queue << query
end
24
+
25
+ # returns the nearest num_hits Mspire::Lipid::Search::Hits sorted by delta
26
+ # [with tie going to the lower m/z]
27
+ # searches all queries and removes them from the data queue
28
# Searches every queued query, empties the queue, and sets the p-value
# on the top hit of each resulting hit group.
#
# @param num_hits [Integer] how many nearest hits to keep per query
# @return [Array] one hit group per query, in queue order
def queries_to_hit_groups!(num_hits=1)
  pending = @data.dup
  @data.clear

  # best_hits reads the isobar groups through this instance variable
  @db_isobar_groups_by_index = @db_spectrum.intensities

  hit_groups = pending.map {|query| best_hits(query, num_hits) }

  # only the top hit of each group receives a p-value
  top_hits = hit_groups.map(&:first)
  pvalues = probability_distribution.pvalues(top_hits)
  top_hits.zip(pvalues).each do |top_hit, pvalue|
    top_hit.pvalue = pvalue
  end

  hit_groups
end
46
+
47
+ # returns a HitGroup object
48
# Finds the num_hits database entries whose m/z is closest to the query
# m/z, sorted by absolute delta (a tie goes to the lower m/z).
#
# @param query an object responding to mz
# @param num_hits [Integer] the number of hits wanted
# @return [HitGroup] the hits, closest first
def best_hits(query, num_hits)
  query_mz = query.mz
  db_mzs = @db_spectrum.mzs
  nearest_index = @db_spectrum.find_nearest_index(query_mz)

  # the num_hits closest entries lie within num_hits - 1 positions of the
  # nearest one; clamp that window to the valid index range
  reach = num_hits - 1
  lowest = [nearest_index - reach, 0].max
  highest = [nearest_index + reach, db_mzs.size - 1].min

  # fall back to the spectrum when queries_to_hit_groups! has not run yet
  groups_by_index = @db_isobar_groups_by_index || @db_spectrum.intensities

  delta_index_pairs = (lowest..highest).map {|i| [(query_mz - db_mzs[i]).abs, i] }
  hits = delta_index_pairs.sort.first(num_hits).map do |_delta, i|
    Hit.new(:db_isobar_group => groups_by_index[i], :observed_mz => query_mz)
  end
  HitGroup.new(hits)
end
66
+
67
# Debugging form: the inherited inspect output plus the number of points
# in the db spectrum and the probability distribution.
def inspect
  point_count = db_spectrum.mzs.size
  "<(#{super}) @db_spectrum(points size)=#{point_count} @probability_distribution=#{probability_distribution}>"
end
70
+
71
# Converts this bin to a plain Range with the same bounds and the same
# end-exclusivity.
def to_range
  lower, upper = self.begin, self.end
  exclude_end? ? (lower...upper) : (lower..upper)
end
74
+ end
75
+ end
76
+ end
77
+ end
78
+
79
+
@@ -0,0 +1,20 @@
1
+
2
+ module Mspire
3
+ class Lipid
4
+ class Search
5
+ # this is a group of Lipid::Ion objects that all have the same (or
6
+ # possibly similar) m/z
7
class DBIsobarGroup < Array
  # The group's m/z is stored separately so that the group *could* hold
  # members with slightly different m/z values and still be used as a
  # container. In the current implementation all members have exactly
  # the same m/z.
  attr_accessor :mz

  # @param ar [Array] the members of the group
  # @param mz the m/z of the group; left unset when nil or false
  def initialize(ar=[], mz=nil)
    replace(ar)
    @mz = mz if mz
  end
end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,79 @@
1
+
2
+ module Mspire
3
+ class Lipid
4
+ class Search
5
class Hit
  # the db_isobar_group this hit is associated with (each hit belongs to
  # exactly one db_isobar_group)
  attr_accessor :db_isobar_group
  # the experimental m/z value
  attr_accessor :observed_mz
  # the probability that the hit is due to random chance
  attr_accessor :pvalue
  # the FDR if the threshold accepts this pvalue; it depends on the
  # number of tests performed and is not intrinsic to the hit itself
  attr_accessor :qvalue
  # qvalue derived from decoy testing
  attr_accessor :decoy_qvalue
  # the probability distribution that can be used to determine the pvalue
  attr_accessor :probability_distribution

  # @param hash [Hash] attribute name => value; each pair is stored in
  #   the instance variable of that name
  def initialize(hash={})
    hash.each_pair do |attribute, value|
      instance_variable_set("@#{attribute}", value)
    end
  end

  # the m/z of the first member of the db_isobar_group
  def theoretical_mz
    @db_isobar_group.first.mz
  end

  # observed m/z minus theoretical m/z (signed)
  def delta
    @observed_mz - theoretical_mz.to_f
  end

  alias_method :amu, :delta

  # the absolute distance from the theoretical value
  def delta_abs
    delta.abs
  end

  # the signed delta in parts per million of the theoretical m/z
  def ppm
    (delta / theoretical_mz) * 1e6
  end

  def inspect
    "<<#{super} -- <ppm=#{ppm} delta=#{delta} theoretical_mz=#{theoretical_mz}>>"
  end
end
54
+
55
+ # A query that matched multiple items. Each search returns a hit group
56
+ # which consists of the best hits for that experimental m/z. When
57
+ # queried for values like delta or ppm, it will delegate to the first hit.
58
+ # So, in many ways it can be used as a container for hits, but it puts
59
+ # its best face forward.
60
# The best hits for one experimental m/z, best hit first. Queries such as
# delta or ppm are answered by the first (best) hit, so the group can be
# used as a container for hits that puts its best face forward.
class HitGroup < Array
  def delta
    first.delta
  end

  def ppm
    first.ppm
  end

  def theoretical_mz
    first.theoretical_mz
  end

  # the db_isobar_group of the best hit
  def db_isobar_group
    first.db_isobar_group
  end

  # Older name for db_isobar_group, kept for existing callers. Hits expose
  # db_isobar_group (there is no Hit#query_group), so delegate to that.
  def query_group
    db_isobar_group
  end

  def observed_mz
    first.observed_mz
  end

  def pvalue
    first.pvalue
  end

  def qvalue
    first.qvalue
  end

  def decoy_qvalue
    first.decoy_qvalue
  end

  # the first hit of the group
  def best_hit
    first
  end
end
77
+ end
78
+ end
79
+ end