mspire-lipidomics 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,79 @@
1
+
2
+ module Mspire
3
+ class Lipid
4
+ class Search
5
# A single match between an experimentally observed m/z and one group of
# isobaric database entries.
class Hit
  # the db_isobar_group this hit is associated with. Each hit is only
  # associated with a single db_isobar_group!
  attr_accessor :db_isobar_group
  # the experimental m/z value
  attr_accessor :observed_mz
  # the probability the hit is due to random chance
  attr_accessor :pvalue
  # the FDR if the threshold accepts this pvalue. Note that this value
  # is relative to the number of tests performed and not completely
  # intrinsic to the hit itself.
  attr_accessor :qvalue

  # qvalue derived from decoy testing
  attr_accessor :decoy_qvalue

  # the probability distribution that can be used to determine its
  # pvalue
  attr_accessor :probability_distribution

  # hash - attribute name => value pairs.  Each pair is stored in the
  # instance variable of the same name (e.g. :pvalue => 0.1 sets @pvalue).
  def initialize(hash={})
    hash.each {|k,v| instance_variable_set("@#{k}", v) }
  end

  # the m/z of the first member of the db_isobar_group, returned exactly
  # as that member reports it (no numeric coercion)
  def theoretical_mz
    @db_isobar_group.first.mz
  end

  # observed_mz - theoretical m/z (signed)
  def delta
    @observed_mz - theoretical_mz.to_f
  end

  alias_method :amu, :delta

  # the absolute value of distance from true val
  def delta_abs
    delta.abs
  end

  # parts per million (divided by theoretical m/z).  The theoretical m/z
  # is coerced with to_f exactly as in #delta so the two methods accept
  # the same kinds of mz values.
  def ppm
    (delta / theoretical_mz.to_f) * 1e6
  end

  def inspect
    "<<#{super} -- <ppm=#{ppm} delta=#{delta} theoretical_mz=#{theoretical_mz}>>"
  end
end
54
+
55
# A query that matched multiple items. Each search returns a hit group
# which consists of the best hits for that experimental m/z. When
# queried for values like delta or ppm, it will delegate to the first hit.
# So, in many ways it can be used as a container for hits, but it puts
# its best face forward.
class HitGroup < Array

  # should implement with delegator obviously...
  # should allow setting ???

  def delta() first.delta end
  def ppm() first.ppm end
  def theoretical_mz() first.theoretical_mz end
  def db_isobar_group() first.db_isobar_group end
  def observed_mz() first.observed_mz end
  def pvalue() ; first.pvalue end
  def qvalue() ; first.qvalue end
  def decoy_qvalue() ; first.decoy_qvalue end

  # Older name for the group of database entries behind the best hit.
  # Hit exposes that group as db_isobar_group and defines no query_group,
  # so fall back to db_isobar_group unless the hit really answers to
  # query_group.
  def query_group
    best = first
    best.respond_to?(:query_group) ? best.query_group : best.db_isobar_group
  end

  # the first (best) hit of the group; nil when the group is empty
  def best_hit() first end

end
77
+ end
78
+ end
79
+ end
@@ -0,0 +1,50 @@
1
+
2
+ module Mspire
3
+ class Lipid
4
+ class Search
5
# A generalized extreme value distribution (evaluated in R through
# Rserve) over the log of m/z deviations.  It is used to turn the deviation
# of a hit into a p-value.
class ProbabilityDistribution
  DEFAULT_TYPE = :ppm
  # connection to R shared by every distribution; opened when the class
  # is loaded
  R = Rserve::Simpler.new

  # takes location, scale and shape parameters
  attr_accessor :location, :scale, :shape
  # type is :ppm or :delta_abs
  attr_accessor :type
  def initialize(location, scale, shape, type=DEFAULT_TYPE)
    @location, @scale, @shape = location, scale, shape
    @type = type
  end

  # takes a hit and returns the pvalue of its deviation (hit.send(type)).
  # The absolute deviation is used, as in #pvalues: the log of a negative
  # deviation is NaN.
  def pvalue(hit)
    R.converse "pgev(log(#{hit.send(type).abs}), #{@location}, #{@scale}, #{@shape})"
  end

  # same as pvalue, just tries to limit the number of calls to R to
  # speed things up!  Always returns an Array, one pvalue per hit.
  def pvalues(hits)
    deltas = hits.map {|v| v.send(type).abs }
    reply = R.converse("sapply(r_devs, function(elt) pgev(log(elt), #{@location}, #{@scale}, #{@shape}))", :r_devs => deltas)
    reply.is_a?(Array) ? reply : [reply]
  end

  # loads the R library +lib+ and raises if the reply from R is too short
  # to be a successful library() reply
  def self.require_r_library(lib)
    reply = R.converse "library(#{lib})"
    unless reply.size > 4 # ~roughly
      $stderr.puts "The libraries ismev and evd must be installed in your R env!"
      $stderr.puts "From within R (works best if R is started with sudo or root for installing):"
      $stderr.puts %Q{install.packages("ismev") ; install.packages("evd")}
      raise "must have R (rserve) and ismev and evd installed!"
    end
  end

  # fits a distribution to log(devs) with gev.fit in R and returns a new
  # ProbabilityDistribution of the given type built from the three
  # fitted parameters (location, scale, shape)
  def self.deviations_to_probability_distribution(type, devs)
    %w(ismev evd).each {|lib| require_r_library(lib) }
    params = R.converse("m <- gev.fit(log(devs_r))\n c(m$mle[1], m$mle[2], m$mle[3])", :devs_r => devs )
    self.new(*params, type)
  end
end
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,23 @@
1
+
2
+
3
+
4
+ module Mspire
5
+ class Lipid
6
+ class Search
7
# One experimental m/z to be searched, together with where it came from.
class Query

  # mz    - the experimentally observed lowest mz
  # index - the index of search spectrum that the m/z was derived from;
  #         this allows for the creation of an isotope envelope starting
  #         from a particular m/z value.
  attr_accessor :mz, :index

  def initialize(mz, index)
    @mz = mz
    @index = index
  end
end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,205 @@
1
+ require 'mspire/spectrum'
2
+ require 'rserve/simpler' # TODO: move to integrated interface with rserve when available
3
+ require 'core_ext/array/in_groups'
4
+ require 'mspire/lipid/search/hit'
5
+ require 'mspire/lipid/search/bin'
6
+ require 'mspire/lipid/modification'
7
+ require 'mspire/lipid/search/probability_distribution'
8
+
9
+ module Mspire
10
+ class Lipid
11
+ class Search
12
+ STANDARD_MODIFICATIONS = {
13
+ :proton => [1,2],
14
+ :ammonium => [1],
15
+ :lithium => [1],
16
+ :water => [1,2],
17
+ }
18
+ STANDARD_SEARCH = {
19
+ :units => :ppm,
20
+ :query_min_count_per_bin => 500, # min number of peaks per bin
21
+ :num_rand_samples_per_bin => 1000,
22
+ :num_nearest => 2,
23
+ :return_order => :as_given, # or :sorted
24
+ }
25
+
26
+ attr_accessor :options
27
+ attr_accessor :search_function
28
+
29
# will generate one query object per lipid for every modification/count
# pair and return a new search object built from them.
# uses only one kind of loss at a time and one type of gain at a time.
#
# lipids        - the lipids to modify
# mods          - hash of modification name => array of counts
# gain_and_loss - combining a gain with a loss is not implemented yet;
#                 passing true raises NotImplementedError
#
# NOTE(review): the objects are built with Search::Query.new(lipid, mods)
# although Search#initialize documents Mspire::Lipid::Ion objects --
# confirm which class is intended.
def self.generate_simple_queries(lipids, mods=STANDARD_MODIFICATIONS, gain_and_loss=false)
  if gain_and_loss
    raise NotImplementedError, "combining a gain and a loss (gain_and_loss) is not implemented yet"
  end

  real_mods_and_cnts = mods.map {|name, cnts| [Mspire::Lipid::Modification.new(name), cnts] }

  possible_lipids = []
  # one of each modification/count for every lipid
  real_mods_and_cnts.each do |mod, counts|
    counts.each do |cnt|
      lipids.each do |lipid|
        possible_lipids << Mspire::Lipid::Search::Query.new(lipid, Array.new(cnt, mod))
      end
    end
  end
  self.new(possible_lipids)
end
54
+
55
# ions - Mspire::Lipid::Ion objects; each one should give a non-nil m/z
#        value
# opts - overrides for STANDARD_SEARCH; the merged hash is kept in
#        @options and handed to create_search_function
def initialize(ions=[], opts={})
  merged_options = STANDARD_SEARCH.merge(opts)
  @options = merged_options
  @db_isobar_spectrum = create_db_isobar_spectrum(ions)
  @search_function = create_search_function(ions, merged_options)
end
62
+
63
# returns an array of HitGroup objects; the best hit of each group is
# given a BH derived q-value (will switch to Storey soon enough).
# assumes search_queries are in ascending m/z order
#
# opts override the options given at construction for this call only:
#   :num_nearest  - number of nearest hits passed to the search function
#   :return_order - :as_given (or :given) returns the hit groups in the
#                   order the search function produced them; :sorted
#                   returns them in the order produced by qvalues!
# raises ArgumentError for any other :return_order
def search(search_queries, opts={})
  opt = @options.merge( opts )
  hit_groups = @search_function.call(search_queries, opt[:num_nearest])
  sorted_hit_groups = qvalues!(hit_groups, opt)
  # read the merged options so the configured default applies
  case opt[:return_order]
  when :as_given, :given
    hit_groups
  when :sorted
    sorted_hit_groups
  else
    raise ArgumentError, "invalid :return_order"
  end
end
80
+
81
# Assigns a q-value (BH style for now) to the best hit of every hit group
# and returns the hit groups ordered by p-value, with ties broken by
# absolute delta, absolute ppm and then the original position.
#
# hit_groups - objects answering pvalue, delta and ppm whose first
#              element accepts qvalue=
# opts       - accepted for symmetry with #search; not consulted here
#
# raises RuntimeError if any group has a NaN p-value
def qvalues!(hit_groups, opts)

  # from http://stats.stackexchange.com/questions/870/multiple-hypothesis-testing-correction-with-benjamini-hochberg-p-values-or-q-va
  # but I've already coded this up before, too, in multiple ways...
  num_total_tests = hit_groups.size

  # first hit is the best hit in the group
  pval_hg_index_tuples = hit_groups.each_with_index.map {|hg,i| [hg.pvalue, hg.delta.abs, hg.ppm.abs, i, hg] }

  has_nan = pval_hg_index_tuples.any? do |tuple|
    pval = tuple.first
    # only some numeric classes answer nan?
    pval.respond_to?(:nan?) && pval.nan?
  end
  if has_nan
    $stderr.puts "pvalue of NaN!"
    $stderr.puts ">>> Consider increasing query_min_count_per_bin or setting ppm to false <<<"
    raise "pvalue of NaN! (cannot calculate q-values)"
  end

  # the unique index sits before the hit group in each tuple, so the hit
  # groups themselves never need to be compared
  sorted_pval_index_tuples = pval_hg_index_tuples.sort

  prev_bh_value = 0
  sorted_pval_index_tuples.each_with_index do |tuple,i|
    pval = tuple.first
    bh_value = pval * num_total_tests / (i + 1)
    # Sometimes this correction can give values greater than 1,
    # so we set those values at 1
    bh_value = [bh_value, 1].min

    # To preserve monotonicity in the values, we take the
    # maximum of the previous value or this one, so that we
    # don't yield a value less than the previous.
    bh_value = [bh_value, prev_bh_value].max
    prev_bh_value = bh_value
    tuple.last.first.qvalue = bh_value # give the top hit the q-value
  end

  sorted_pval_index_tuples.map(&:last)
end
119
+
120
# Builds the search bins for the given ions, fits a probability
# distribution for each of them, and returns the search function.
#
# opt keys read here:
#   :query_min_count_per_bin  - passed to create_search_bins
#   :num_rand_samples_per_bin - random samples drawn per bin
#   :ppm                      - true/false; when the key is absent, ppm is
#                               used exactly when opt[:units] == :ppm
#   :num_nearest              - fallback when the search function is
#                               called with a nil num_nearest_hits
#
# returns a lambda taking (search_queries, num_nearest_hits) which
# returns an array of hit_groups
def create_search_function(ions, opt)

  db_isobar_spectrum = create_db_isobar_spectrum(ions)

  search_bins = create_search_bins(db_isobar_spectrum, opt[:query_min_count_per_bin])

  # STANDARD_SEARCH expresses the unit as :units => :ppm and defines no
  # :ppm key, so derive the flag from :units unless :ppm is given
  use_ppm = opt.fetch(:ppm) { opt[:units] == :ppm }

  create_probability_distribution_for_search_bins!(search_bins, db_isobar_spectrum, opt[:num_rand_samples_per_bin], use_ppm)

  # create the actual search function
  # returns an array of hit_groups
  lambda do |search_queries, num_nearest_hits|
    num_nearest = num_nearest_hits || opt[:num_nearest]
    Bin.bin(search_bins, search_queries, &:mz)
    search_bins_with_data = search_bins.reject {|bin| bin.data.empty? }
    search_bins_with_data.flat_map {|bin| bin.queries_to_hit_groups!(num_nearest) }
  end
end
136
+
137
+ #####################################################
138
+ # Ancillary to create_search_function:
139
+ #####################################################
140
+
141
# returns a DB isobar spectrum: the first data array holds every distinct
# m/z to search for (ascending) and the second holds, at the same
# position, the array of all the lipid ions sharing that m/z value
def create_db_isobar_spectrum(ions)
  mz_group_pairs = ions.group_by {|ion| ion.mz }.sort_by {|mz, _group| mz }
  sorted_mzs = mz_group_pairs.map {|mz, _group| mz }
  isobar_groups = mz_group_pairs.map {|_mz, group| group }
  Mspire::Spectrum.new([sorted_mzs, isobar_groups])
end
150
+
151
# Gives every search bin a probability distribution fitted to the
# deviations between random m/z values drawn from the bin's range and
# their nearest m/z in the DB isobar spectrum.
#
# use_ppm - deviations are expressed in ppm (relative to the nearest m/z)
#           when true, in amu when false
# returns the search_bins
def create_probability_distribution_for_search_bins!(search_bins, db_isobar_spectrum, num_rand_samples_per_bin, use_ppm=true)
  deviation_type = use_ppm ? :ppm : :amu
  search_bins.each do |search_bin|
    rng = Random.new
    sampled_mzs = (0...num_rand_samples_per_bin).map { rng.rand(search_bin.to_range) }
    # find the deltas
    deviations = sampled_mzs.map do |sampled_mz|
      nearest_mz = db_isobar_spectrum.find_nearest(sampled_mz)
      abs_delta = (sampled_mz - nearest_mz).abs
      if use_ppm
        abs_delta / nearest_mz * 1e6
      else
        abs_delta
      end
    end
    search_bin.probability_distribution = ProbabilityDistribution.deviations_to_probability_distribution(deviation_type, deviations)
  end
  search_bins
end
167
+
168
# Splits the DB isobar spectrum into consecutive m/z bins.
#
# db_isobar_spectrum - answers mzs and points (each point's first element
#                      is used as its m/z)
# min_n_per_bin      - the number of bins is the largest count for which
#                      every bin still averages at least this many m/z
#                      values (never less than one bin)
#
# returns an array of Mspire::Lipid::Search::Bin.  Every bin but the last
# covers [first m/z of its group, first m/z of the next group); the last
# bin runs inclusively to the last m/z.
# raises RuntimeError if the spectrum is empty
def create_search_bins(db_isobar_spectrum, min_n_per_bin)
  ss = db_isobar_spectrum.mzs.size
  raise 'I think you need some data in your query spectrum!' if ss == 0

  # make sure we get the right bin size based on the input
  optimal_num_groups = 1
  (1..ss).each do |divisions|
    break unless (ss.to_f / divisions) >= min_n_per_bin
    optimal_num_groups = divisions
  end

  groups = db_isobar_spectrum.points.in_groups(optimal_num_groups,false).to_a
  bin_class = Mspire::Lipid::Search::Bin

  case groups.size
  when 0
    raise 'I think you need some data in your query spectrum!'
  when 1
    group = groups.first
    [ bin_class.new( Range.new(group.first.first, group.last.first), db_isobar_spectrum ) ]
  else
    # half-open ranges so neighbouring bins never share an m/z
    search_bins = groups.each_cons(2).map do |points1, points2|
      bin_class.new( Range.new(points1.first.first, points2.first.first, true), db_isobar_spectrum )
    end
    last_group = groups.last
    _range = Range.new(last_group.first.first, last_group.last.first)
    search_bins << bin_class.new(_range, db_isobar_spectrum) # inclusive
  end
end
199
+ end
200
+ end
201
+ end
202
+
203
+
204
+
205
+
@@ -0,0 +1,19 @@
1
+
2
+ module Mspire
3
# One lipid record.  The attributes, in constructor order, are listed by
# Lipid.members.
class Lipid
  # the attribute names, in the order #initialize expects its arguments
  def self.members
    [:lm_id,:common_name,:systematic_name,:formula,:mass,:category,:main_class,:sub_class,:pubchem_id,:inchi_key,:kegg_id,:chebi_id,:structure]
  end

  members.each {|mem| attr_accessor mem }

  # args - values for the attributes in Lipid.members order; missing
  #        trailing values are left nil and extra values are ignored.
  # Each value is stored under the member's own name so that every
  # accessor (pubchem_id included) reads back what was passed in.
  def initialize(*args)
    self.class.members.zip(args) {|mem, val| instance_variable_set("@#{mem}", val) }
  end

  # short description: id, formula, mass and the common name cut to 20
  # characters (a missing common name is shown as empty)
  def inspect
    name = common_name.to_s
    cut_common_name = (name.size <= 20) ? name : (name[0,20]+"...")
    "<#{lm_id}: #{formula}: #{mass} #{cut_common_name}>"
  end
end
19
+ end
@@ -0,0 +1,87 @@
1
+ require 'mspire/lipid'
2
+ require 'mspire/mass'
3
+
4
+ module Mspire
5
# Reads LipidMaps tab separated files into Mspire::Lipid objects.
module LipidMaps

  DEFAULTS = {
    :high_res_mass => true,
    :rubabel_molecules => false,
    :molecular_formula_objects => true,
  }

  # returns an array of Lipids (lines whose formula column contains no
  # capital letter are skipped)
  #
  # The file type is recognized from the number of header columns:
  # 8 (programmatic), 20 (download) or 21 (download with structure).
  #
  # opts (see DEFAULTS):
  #   :high_res_mass             => true  for programmatic files the mass
  #                                       is recalculated from the formula
  #   :molecular_formula_objects => true  formula becomes a
  #                                       Mspire::MolecularFormula
  #   :rubabel_molecules         => false structure becomes a
  #                                       Rubabel::Molecule (requires
  #                                       rubabel)
  #
  # raises ArgumentError if the header has an unrecognized column count
  def self.parse_file(lipidmaps_tsv, opts={})
    require 'rubabel' if opts[:rubabel_molecules]

    opts = DEFAULTS.merge(opts)

    # the lipidmaps_filetype
    lm_ft = nil

    # block form so the file is closed even if parsing raises
    lipids = File.open(lipidmaps_tsv) do |io|
      header = io.readline.split("\t")
      lm_ft =
        case header.size
        when 8
          :programmatic
        when 20
          :download
        when 21
          :download_sd
        else
          raise ArgumentError, "unrecognized LipidMaps file: #{header.size} header columns (expected 8, 20 or 21)"
        end

      # for each Lipid member, the column holding its value
      index_mapping =
        case lm_ft
        when :programmatic
          (0...(Mspire::Lipid.members.size)).to_a
        when :download, :download_sd
          indices = {
            :lm_id => 0,
            :systematic_name => 1,
            :category => 3,
            :main_class => 4,
            :mass => 5,
            :formula => 6,
            :pubchem_id => 7,
            :inchi_key => 8,
            :common_name => 11,
            :kegg_id => 12,
            :chebi_id => 13,
            :sub_class => 14,
            :structure => 20,
          }
          Mspire::Lipid.members.map {|key| indices[key] }
        end

      formula_i = index_mapping[Mspire::Lipid.members.index(:formula)]

      io.each_line.map do |line|
        line.chomp!
        data = line.split("\t")
        if data[formula_i] =~ /[A-Z]/ # <- there is a formula!
          lipid = Mspire::Lipid.new( *index_mapping.map {|i| data[i] } )
          lipid.mass = lipid.mass.to_f
          lipid
        end
      end.compact
    end

    if opts.values_at(:molecular_formula_objects, :rubabel_molecules).any? || (opts[:high_res_mass] && lm_ft == :programmatic)
      lipids.each do |lipid|
        if opts[:molecular_formula_objects]
          lipid.formula = Mspire::MolecularFormula.new(lipid.formula)
        end
        if lm_ft == :programmatic && opts[:high_res_mass]
          lipid.mass = Mspire::Mass.formula_to_exact_mass(lipid.formula)
        end
        if opts[:rubabel_molecules]
          lipid.structure = Rubabel::Molecule.from_string(lipid.structure.gsub('|', "\n"), :sdf)
        end
      end
    end
    lipids
  end
end
85
+ end
86
+
87
+
@@ -0,0 +1,85 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
# NOTE(review): the rubabel dependency was emitted as the single token
# %q<rubabel>= 0.1.0>, which Ruby cannot parse (the literal ends at the
# first '>').  It is split here into name and requirement; since the
# header says this file is generated, the dependency declaration the
# generator reads presumably needs the same correction -- confirm.
Gem::Specification.new do |s|
  s.name = "mspire-lipidomics"
  s.version = "0.1.4"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["John T. Prince"]
  s.date = "2012-05-10"
  s.description = "does lipidomics"
  s.email = "jtprince@gmail.com"
  s.executables = ["lipidomic-search.rb"]
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".rspec",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "bin/lipidomic-search.rb",
    "lib/mspire/lipid.rb",
    "lib/mspire/lipid/ion.rb",
    "lib/mspire/lipid/ion/fragment.rb",
    "lib/mspire/lipid/modification.rb",
    "lib/mspire/lipid/search.rb",
    "lib/mspire/lipid/search/bin.rb",
    "lib/mspire/lipid/search/db_isobar_group.rb",
    "lib/mspire/lipid/search/hit.rb",
    "lib/mspire/lipid/search/probability_distribution.rb",
    "lib/mspire/lipid/search/query.rb",
    "lib/mspire/lipid_maps.rb",
    "scratch/OBConversion_methods.txt",
    "scratch/atom_methods.txt",
    "scratch/bond_methods.txt",
    "scratch/mol_methods.txt",
    "scratch/split_molecules.rb",
    "script/find_nearest_lipid.rb",
    "spec/mspire/lipid/ion_spec.rb",
    "spec/mspire/lipid/modification_spec.rb",
    "spec/mspire/lipid/search_spec.rb",
    "spec/mspire/lipid_maps_spec.rb",
    "spec/mspire/lipid_spec.rb",
    "spec/spec_helper.rb",
    "spec/testfiles/lipidmaps_download.tsv",
    "spec/testfiles/lipidmaps_programmatic_short.tsv",
    "spec/testfiles/lipidmaps_sd_download.tsv"
  ]
  s.homepage = "http://github.com/princelab/mspire-lipidomics"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.18"
  s.summary = "mass spectrometry based lipidomics - especially shotgun lipidomics"

  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<mspire>, [">= 0.7.8"])
      s.add_development_dependency(%q<rubabel>, [">= 0.1.0"])
      s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
      s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
      s.add_development_dependency(%q<rcov>, [">= 0"])
    else
      s.add_dependency(%q<mspire>, [">= 0.7.8"])
      s.add_dependency(%q<rubabel>, [">= 0.1.0"])
      s.add_dependency(%q<rspec>, ["~> 2.3.0"])
      s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
      s.add_dependency(%q<rcov>, [">= 0"])
    end
  else
    s.add_dependency(%q<mspire>, [">= 0.7.8"])
    s.add_dependency(%q<rubabel>, [">= 0.1.0"])
    s.add_dependency(%q<rspec>, ["~> 2.3.0"])
    s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
    s.add_dependency(%q<rcov>, [">= 0"])
  end
end
85
+
@@ -0,0 +1,47 @@
1
+ get_in_stream
2
+ get_out_stream
3
+ set_in_stream
4
+ set_out_stream
5
+ set_in_and_out_formats
6
+ set_in_format
7
+ set_out_format
8
+ get_in_format
9
+ get_out_format
10
+ get_in_filename
11
+ get_in_pos
12
+ get_in_len
13
+ get_title
14
+ get_aux_conv
15
+ set_aux_conv
16
+ is_option
17
+ get_options
18
+ add_option
19
+ remove_option
20
+ set_options
21
+ copy_options
22
+ get_supported_input_format
23
+ get_supported_output_format
24
+ convert
25
+ full_convert
26
+ add_chem_object
27
+ get_chem_object
28
+ is_last
29
+ is_first_input
30
+ set_first_input
31
+ get_output_index
32
+ set_output_index
33
+ set_more_files_to_come
34
+ set_one_object_only
35
+ set_last
36
+ is_last_file
37
+ get_count
38
+ write
39
+ write_string
40
+ write_file
41
+ close_out_file
42
+ read
43
+ read_string
44
+ read_file
45
+ open_in_and_out_files
46
+ report_number_converted
47
+ num_input_objects