bioinform 0.1.17 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -3
- data/LICENSE +0 -1
- data/README.md +1 -1
- data/TODO.txt +23 -30
- data/bin/convert_motif +4 -0
- data/bin/pcm2pwm +1 -1
- data/bin/split_motifs +1 -1
- data/bioinform.gemspec +0 -2
- data/lib/bioinform.rb +54 -16
- data/lib/bioinform/alphabet.rb +85 -0
- data/lib/bioinform/background.rb +90 -0
- data/lib/bioinform/cli.rb +1 -2
- data/lib/bioinform/cli/convert_motif.rb +52 -17
- data/lib/bioinform/cli/pcm2pwm.rb +32 -26
- data/lib/bioinform/cli/split_motifs.rb +31 -30
- data/lib/bioinform/conversion_algorithms.rb +6 -0
- data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
- data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
- data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
- data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
- data/lib/bioinform/data_models.rb +1 -7
- data/lib/bioinform/data_models/named_model.rb +38 -0
- data/lib/bioinform/data_models/pcm.rb +18 -28
- data/lib/bioinform/data_models/pm.rb +73 -170
- data/lib/bioinform/data_models/ppm.rb +11 -24
- data/lib/bioinform/data_models/pwm.rb +30 -56
- data/lib/bioinform/errors.rb +17 -0
- data/lib/bioinform/formatters.rb +4 -2
- data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
- data/lib/bioinform/formatters/motif_formatter.rb +69 -0
- data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
- data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
- data/lib/bioinform/parsers.rb +1 -8
- data/lib/bioinform/parsers/matrix_parser.rb +44 -36
- data/lib/bioinform/parsers/motif_splitter.rb +45 -0
- data/lib/bioinform/support.rb +46 -14
- data/lib/bioinform/support/strip_doc.rb +1 -1
- data/lib/bioinform/version.rb +1 -1
- data/spec/alphabet_spec.rb +79 -0
- data/spec/background_spec.rb +57 -0
- data/spec/cli/cli_spec.rb +6 -6
- data/spec/cli/convert_motif_spec.rb +88 -88
- data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
- data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
- data/spec/cli/pcm2pwm_spec.rb +22 -23
- data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
- data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
- data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
- data/spec/cli/split_motifs_spec.rb +6 -21
- data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
- data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
- data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
- data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
- data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
- data/spec/data_models/named_model_spec.rb +41 -0
- data/spec/data_models/pcm_spec.rb +114 -45
- data/spec/data_models/pm_spec.rb +132 -333
- data/spec/data_models/ppm_spec.rb +47 -44
- data/spec/data_models/pwm_spec.rb +85 -77
- data/spec/fabricators/motif_formats_fabricator.rb +116 -116
- data/spec/formatters/consensus_formatter_spec.rb +26 -0
- data/spec/formatters/raw_formatter_spec.rb +169 -0
- data/spec/parsers/matrix_parser_spec.rb +216 -0
- data/spec/parsers/motif_splitter_spec.rb +87 -0
- data/spec/spec_helper.rb +2 -2
- data/spec/spec_helper_source.rb +25 -5
- data/spec/support_spec.rb +31 -0
- metadata +43 -124
- data/bin/merge_into_collection +0 -4
- data/lib/bioinform/cli/merge_into_collection.rb +0 -80
- data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
- data/lib/bioinform/data_models/collection.rb +0 -75
- data/lib/bioinform/data_models/motif.rb +0 -56
- data/lib/bioinform/formatters/raw_formatter.rb +0 -41
- data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
- data/lib/bioinform/parsers/parser.rb +0 -92
- data/lib/bioinform/parsers/splittable_parser.rb +0 -57
- data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
- data/lib/bioinform/parsers/string_parser.rb +0 -72
- data/lib/bioinform/parsers/trivial_parser.rb +0 -34
- data/lib/bioinform/parsers/yaml_parser.rb +0 -35
- data/lib/bioinform/support/advanced_scan.rb +0 -8
- data/lib/bioinform/support/array_product.rb +0 -6
- data/lib/bioinform/support/array_zip.rb +0 -6
- data/lib/bioinform/support/collect_hash.rb +0 -7
- data/lib/bioinform/support/deep_dup.rb +0 -5
- data/lib/bioinform/support/delete_many.rb +0 -14
- data/lib/bioinform/support/inverf.rb +0 -13
- data/lib/bioinform/support/multiline_squish.rb +0 -6
- data/lib/bioinform/support/parameters.rb +0 -28
- data/lib/bioinform/support/partial_sums.rb +0 -16
- data/lib/bioinform/support/same_by.rb +0 -12
- data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
- data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
- data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
- data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
- data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
- data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
- data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
- data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
- data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
- data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
- data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
- data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
- data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
- data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
- data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
- data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
- data/spec/cli/data/split_motifs/collection.yaml +0 -188
- data/spec/cli/merge_into_collection_spec.rb +0 -100
- data/spec/data_models/collection_spec.rb +0 -98
- data/spec/data_models/motif_spec.rb +0 -224
- data/spec/fabricators/collection_fabricator.rb +0 -8
- data/spec/fabricators/motif_fabricator.rb +0 -33
- data/spec/fabricators/pcm_fabricator.rb +0 -25
- data/spec/fabricators/pm_fabricator.rb +0 -52
- data/spec/fabricators/ppm_fabricator.rb +0 -14
- data/spec/fabricators/pwm_fabricator.rb +0 -16
- data/spec/parsers/parser_spec.rb +0 -152
- data/spec/parsers/string_fantom_parser_spec.rb +0 -70
- data/spec/parsers/string_parser_spec.rb +0 -77
- data/spec/parsers/trivial_parser_spec.rb +0 -64
- data/spec/parsers/yaml_parser_spec.rb +0 -50
- data/spec/support/advanced_scan_spec.rb +0 -32
- data/spec/support/array_product_spec.rb +0 -15
- data/spec/support/array_zip_spec.rb +0 -15
- data/spec/support/collect_hash_spec.rb +0 -15
- data/spec/support/delete_many_spec.rb +0 -44
- data/spec/support/inverf_spec.rb +0 -19
- data/spec/support/multiline_squish_spec.rb +0 -25
- data/spec/support/partial_sums_spec.rb +0 -30
- data/spec/support/same_by_spec.rb +0 -36
@@ -1,47 +1,53 @@
|
|
1
1
|
require_relative '../../bioinform'
|
2
|
-
require '
|
2
|
+
require 'optparse'
|
3
3
|
require 'shellwords'
|
4
4
|
|
5
5
|
module Bioinform
|
6
|
-
module CLI
|
6
|
+
module CLI
|
7
7
|
module PCM2PWM
|
8
8
|
extend Bioinform::CLI::Helpers
|
9
9
|
def self.main(argv)
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
10
|
+
options = {folder: '.', extension: 'pwm'}
|
11
|
+
opt_parser = OptionParser.new do |opts|
|
12
|
+
opts.banner = "PCM to PWM converter.\n" +
|
13
|
+
"It transforms files with PCMs into files with PWMs.\n" +
|
14
|
+
"Folder for resulting files to save files can be specified.\n" +
|
15
|
+
"Resulting PWM files have the same name as original file but have another extension (.pwm by default).\n" +
|
16
|
+
"When filelist is empty, it's obtained from STDIN.\n" +
|
17
|
+
"One can use it: `ls -b pcm_folder/*.pcm | pcm2pwm` (ls -b option escape spaces in filenames)\n" +
|
18
|
+
"\n" +
|
19
|
+
"Usage:\n" +
|
20
|
+
" pcm2pwm [options] [<pcm-files>...]"
|
21
|
+
opts.version = ::Bioinform::VERSION
|
22
|
+
opts.on('-e', '--extension EXT', 'Extension of output files [default: pwm]') do |v|
|
23
|
+
options[:extension] = v
|
24
|
+
end
|
25
|
+
opts.on('-f', '--folder FOLDER', 'Where to save output files') do |v|
|
26
|
+
options[:folder] = v
|
27
|
+
end
|
28
|
+
end
|
26
29
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
+
opt_parser.parse!(argv)
|
31
|
+
pcm_files = argv
|
32
|
+
folder = options[:folder]
|
33
|
+
extension = options[:extension]
|
30
34
|
|
31
35
|
Dir.mkdir(folder) unless Dir.exist?(folder)
|
32
36
|
filelist = (pcm_files.empty?) ? $stdin.read.shellsplit : pcm_files
|
33
37
|
|
38
|
+
converter = ConversionAlgorithms::PCM2PWMConverter.new()
|
39
|
+
|
34
40
|
filelist.each do |filename|
|
35
|
-
|
41
|
+
input = File.read(filename)
|
42
|
+
motif_data = MatrixParser.new.parse(input)
|
43
|
+
pcm = MotifModel::PCM.new(motif_data[:matrix]).named(motif_data[:name])
|
44
|
+
pwm = converter.convert(pcm)
|
36
45
|
File.open(change_folder_and_extension(filename, extension, folder), 'w') do |f|
|
37
46
|
f.puts pwm
|
38
47
|
end
|
39
48
|
end
|
40
|
-
|
41
|
-
rescue Docopt::Exit => e
|
42
|
-
puts e.message
|
43
49
|
end
|
44
50
|
|
45
51
|
end
|
46
52
|
end
|
47
|
-
end
|
53
|
+
end
|
@@ -1,47 +1,48 @@
|
|
1
1
|
require_relative '../../bioinform'
|
2
|
-
require '
|
2
|
+
require 'optparse'
|
3
3
|
|
4
4
|
module Bioinform
|
5
5
|
module CLI
|
6
6
|
module SplitMotifs
|
7
7
|
extend Bioinform::CLI::Helpers
|
8
8
|
def self.main(argv)
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
9
|
+
options = {folder: '.'}
|
10
|
+
opt_parser = OptionParser.new do |opts|
|
11
|
+
opts.version = ::Bioinform::VERSION
|
12
|
+
opts.banner = "Motif splitter.\n" +
|
13
|
+
"It gets a file with a set of motifs and splits it into motifs according to their names.\n" +
|
14
|
+
"\n" +
|
15
|
+
"Usage:\n" +
|
16
|
+
" split_motifs [options] <collection-file>"
|
17
|
+
opts.on('-e', '--extension EXT', 'Extension of output files') do |v|
|
18
|
+
options[:extension] = v
|
19
|
+
end
|
20
|
+
opts.on('-f', '--folder FOLDER', 'Where to save output files') do |v|
|
21
|
+
options[:folder] = v
|
22
|
+
end
|
23
|
+
end
|
21
24
|
|
22
|
-
|
23
|
-
|
25
|
+
opt_parser.parse!(argv)
|
26
|
+
folder = options[:folder]
|
27
|
+
extension = options[:extension]
|
28
|
+
collection_filename = argv.first
|
24
29
|
|
25
|
-
folder = options['--folder']
|
26
|
-
extension = options['--extension']
|
27
|
-
collection_filename = options['<collection-file>']
|
28
30
|
|
29
31
|
Dir.mkdir(folder) unless Dir.exist?(folder)
|
30
|
-
raise "
|
32
|
+
raise "Collection file not specified" unless collection_filename
|
33
|
+
raise "File `#{collection_filename}` not exist" unless File.exist?(collection_filename)
|
31
34
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
35
|
+
motif_list_string = File.read(collection_filename)
|
36
|
+
coll = MotifSplitter.new.split(motif_list_string).map do |motif_string|
|
37
|
+
motif_info = MatrixParser.new.parse(motif_string)
|
38
|
+
MotifModel::PM.new(motif_info[:matrix]).named(motif_info[:name])
|
39
|
+
end
|
40
|
+
|
41
|
+
coll.each do |motif|
|
42
|
+
File.open(set_folder(folder, set_extension(motif.name, extension || 'mat')), 'w'){|f| f.puts motif }
|
40
43
|
end
|
41
|
-
rescue Docopt::Exit => e
|
42
|
-
puts e.message
|
43
44
|
end
|
44
45
|
|
45
46
|
end
|
46
47
|
end
|
47
|
-
end
|
48
|
+
end
|
@@ -0,0 +1,6 @@
|
|
1
|
+
require_relative 'conversion_algorithms/pcm2ppm_converter'
|
2
|
+
require_relative 'conversion_algorithms/pcm2pwm_converter'
|
3
|
+
require_relative 'conversion_algorithms/ppm2pcm_converter'
|
4
|
+
require_relative 'conversion_algorithms/pwm2pcm_converter'
|
5
|
+
require_relative 'conversion_algorithms/pwm2iupac_pwm_converter'
|
6
|
+
require_relative 'conversion_algorithms/pcm2pwm_mara_converter'
|
@@ -1,19 +1,21 @@
|
|
1
|
+
require_relative '../data_models'
|
2
|
+
|
1
3
|
module Bioinform
|
2
4
|
module ConversionAlgorithms
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
def self.convert(pcm, parameters = {})
|
5
|
+
class PCM2PPMConverter
|
6
|
+
def convert(pcm)
|
7
|
+
raise Error, "#{self.class}#convert accepts only models acting as PCM" unless MotifModel.acts_as_pcm?(pcm)
|
7
8
|
matrix = pcm.each_position.map do |pos|
|
8
|
-
pos.
|
9
|
-
|
10
|
-
|
9
|
+
count = pos.inject(0.0, &:+)
|
10
|
+
pos.map {|el| el / count }
|
11
|
+
end
|
12
|
+
ppm = MotifModel::PPM.new(matrix)
|
13
|
+
if pcm.respond_to? :name
|
14
|
+
ppm.named(pcm.name)
|
15
|
+
else
|
16
|
+
ppm
|
11
17
|
end
|
12
|
-
PPM.new(pcm.get_parameters.merge(matrix: matrix))
|
13
18
|
end
|
14
19
|
end
|
15
20
|
end
|
16
21
|
end
|
17
|
-
|
18
|
-
|
19
|
-
|
@@ -1,20 +1,48 @@
|
|
1
|
+
require_relative '../errors'
|
2
|
+
require_relative '../data_models'
|
3
|
+
require_relative '../background'
|
4
|
+
|
1
5
|
module Bioinform
|
2
6
|
module ConversionAlgorithms
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
# s_{\alpha,j} = \ln(\frac{x_{\alpha,j} + \kappa p_{\alpha}}{(N+\kappa)p_{\alpha}})
|
8
|
+
class PCM2PWMConverter
|
9
|
+
attr_reader :background, :pseudocount
|
10
|
+
def initialize(options = {})
|
11
|
+
@background = options.fetch(:background, Bioinform::Background::Uniform)
|
12
|
+
@pseudocount = options.fetch(:pseudocount, :log)
|
13
|
+
end
|
14
|
+
|
15
|
+
def calculate_pseudocount(pcm)
|
16
|
+
case @pseudocount
|
17
|
+
when Numeric
|
18
|
+
@pseudocount
|
19
|
+
when :log
|
20
|
+
Math.log(pcm.count)
|
21
|
+
when :sqrt
|
22
|
+
Math.sqrt(pcm.count)
|
23
|
+
when Proc
|
24
|
+
@pseudocount.call(pcm)
|
25
|
+
else
|
26
|
+
raise Error, 'Unknown pseudocount type use numeric or :log or :sqrt or Proc with taking pcm parameter'
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def convert(pcm)
|
31
|
+
raise Error, "#{self.class}#convert accepts only models acting as PCM" unless MotifModel.acts_as_pcm?(pcm)
|
32
|
+
actual_pseudocount = calculate_pseudocount(pcm)
|
11
33
|
matrix = pcm.each_position.map do |pos|
|
34
|
+
count = pos.inject(0.0, &:+)
|
12
35
|
pos.each_index.map do |index|
|
13
|
-
Math.log((pos[index] +
|
36
|
+
Math.log((pos[index] + @background.frequencies[index] * actual_pseudocount).to_f / (@background.frequencies[index]*(count + actual_pseudocount)) )
|
14
37
|
end
|
15
38
|
end
|
16
|
-
PWM.new(
|
39
|
+
pwm = MotifModel::PWM.new(matrix)
|
40
|
+
if pcm.respond_to? :name
|
41
|
+
pwm.named(pcm.name)
|
42
|
+
else
|
43
|
+
pwm
|
44
|
+
end
|
17
45
|
end
|
18
46
|
end
|
19
47
|
end
|
20
|
-
end
|
48
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require_relative '../errors'
|
2
|
+
require_relative '../data_models'
|
3
|
+
require_relative '../background'
|
4
|
+
|
5
|
+
module Bioinform
|
6
|
+
module ConversionAlgorithms
|
7
|
+
# s_{\alpha,j} = \ln(\frac{x_{\alpha,j} + \kappa p_{\alpha}}{(N+\kappa)p_{\alpha}}) with fixed \kappa = 2 and p_{\alpha} = 0.25
|
8
|
+
class MaraPCM2PWMConverter
|
9
|
+
def convert(pcm)
|
10
|
+
raise Error, "#{self.class}#convert accepts only models acting as PCM" unless MotifModel.acts_as_pcm?(pcm)
|
11
|
+
matrix = pcm.each_position.map do |pos|
|
12
|
+
count = pos.inject(0.0, &:+)
|
13
|
+
pos.each_index.map do |index|
|
14
|
+
Math.log((pos[index] + 0.5).to_f / (0.25 * (count + 2)) )
|
15
|
+
end
|
16
|
+
end
|
17
|
+
pwm = MotifModel::PWM.new(matrix)
|
18
|
+
if pcm.respond_to? :name
|
19
|
+
pwm.named(pcm.name)
|
20
|
+
else
|
21
|
+
pwm
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require_relative '../errors'
|
2
|
+
require_relative '../data_models'
|
3
|
+
|
4
|
+
module Bioinform
|
5
|
+
module ConversionAlgorithms
|
6
|
+
class PPM2PCMConverter
|
7
|
+
attr_reader :count
|
8
|
+
|
9
|
+
def initialize(options = {})
|
10
|
+
@count = options.fetch(:count, 100)
|
11
|
+
end
|
12
|
+
|
13
|
+
def convert(ppm)
|
14
|
+
raise Error, "#{self.class}#convert accepts only models acting as PPM" unless MotifModel.acts_as_ppm?(ppm)
|
15
|
+
matrix = ppm.each_position.map do |pos|
|
16
|
+
pos.map do |el|
|
17
|
+
el * @count
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
pcm = MotifModel::PCM.new(matrix)
|
22
|
+
if ppm.respond_to? :name
|
23
|
+
pcm.named(ppm.name)
|
24
|
+
else
|
25
|
+
pcm
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require_relative '../alphabet'
|
2
|
+
|
3
|
+
module Bioinform
|
4
|
+
module ConversionAlgorithms
|
5
|
+
class PWM2IupacPWMConverter
|
6
|
+
attr_reader :iupac_alphabet
|
7
|
+
def initialize(options = {})
|
8
|
+
@iupac_alphabet = options.fetch(:alphabet, NucleotideAlphabetWithN)
|
9
|
+
end
|
10
|
+
def convert(pwm)
|
11
|
+
raise Error, "Can convert only PWMs" unless MotifModel.acts_as_pwm?(pwm)
|
12
|
+
raise Error, 'this conversion is possible only for ACGT-nucleotide motifs' unless pwm.alphabet == NucleotideAlphabet
|
13
|
+
iupac_matrix = pwm.each_position.map do |pos|
|
14
|
+
@iupac_alphabet.each_letter.map do |letter|
|
15
|
+
nucleotide_indices = IUPAC::NucleotideIndicesByIUPACLetter[letter]
|
16
|
+
nucleotide_indices.inject(0.0){|sum, nucleotide_index| sum + pos[nucleotide_index] } / nucleotide_indices.size
|
17
|
+
end
|
18
|
+
end
|
19
|
+
MotifModel::PWM.new(iupac_matrix, alphabet: @iupac_alphabet)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require_relative '../data_models'
|
2
|
+
|
3
|
+
module Bioinform
|
4
|
+
module ConversionAlgorithms
|
5
|
+
|
6
|
+
# This algorithm is purely heuristic; it is based on our algorithm of PWM calculation.
|
7
|
+
# pcm --> pwm:
|
8
|
+
# s_{\alpha,j} = \ln(\frac{x_{\alpha,j} + \kappa p_{\alpha}}{(N+\kappa)p_{\alpha}}) + \beta_{j}
|
9
|
+
# \beta_j is an arbitrary constant
|
10
|
+
# Hence
|
11
|
+
# pwm --> pcm:
|
12
|
+
# x_{\alpha,j} = (N + \kappa) p_{\alpha} \exp{ s_{\alpha,j} - \beta_j } - \kappa p_{\alpha}
|
13
|
+
# \beta_j = \ln(\sum_{\alpha} p_{\alpha} \exp{s_{\alpha,j}}) because \sum_{\alpha} x_{\alpha,j} = N
|
14
|
+
class PWM2PCMConverter
|
15
|
+
attr_reader :pseudocount, :count, :background
|
16
|
+
|
17
|
+
def initialize(options = {})
|
18
|
+
@pseudocount = options.fetch(:pseudocount, :default)
|
19
|
+
@count = options.fetch(:count, 100.0)
|
20
|
+
@background = options.fetch(:background, Bioinform::Background::Uniform)
|
21
|
+
end
|
22
|
+
|
23
|
+
def calculate_pseudocount(pwm)
|
24
|
+
case @pseudocount
|
25
|
+
when Numeric
|
26
|
+
@pseudocount
|
27
|
+
when :default
|
28
|
+
# Multiplying by 0.95 leaves a safety margin so that rounding errors cannot push the pseudocount above the real maximum, which would generate a PCM with negative elements
|
29
|
+
max_pseudocount = max_pseudocount_fraction(pwm) * @count
|
30
|
+
(Math.log(@count) <= max_pseudocount*0.95) ? Math.log(@count) : max_pseudocount * 0.95
|
31
|
+
when Proc
|
32
|
+
@pseudocount.call(pwm)
|
33
|
+
else
|
34
|
+
raise Error, 'Unknown pseudocount type use numeric or :default or Proc with taking pcm parameter'
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
# \sum_{\alpha} p_{\alpha} \exp{s_{\alpha,j}}
|
39
|
+
def weighted_position_exponent(pos)
|
40
|
+
pos.each_with_index.map {|elem, letter_index| @background.frequencies[letter_index] * Math.exp(elem) }.inject(0.0, &:+)
|
41
|
+
end
|
42
|
+
|
43
|
+
# possible (pseudocount / count) range is from 0 to max_pseudocount_fraction
|
44
|
+
# it's derived from
|
45
|
+
# (-\exp{s_{\alpha,j}} + \sum_{\alpha'} p_{\alpha'}\exp{s_{\alpha',j}}) * pseudocount < count * \exp{s_{\alpha,j}}
|
46
|
+
# which is derived from the fact that each element of PCM should be not less than 0
|
47
|
+
def max_pseudocount_fraction(pwm)
|
48
|
+
# min = 0.0
|
49
|
+
max = Float::INFINITY
|
50
|
+
pwm.each_position do |pos|
|
51
|
+
pos.each_with_index do |elem, letter_index|
|
52
|
+
coeff = weighted_position_exponent(pos) - Math.exp(elem)
|
53
|
+
if coeff > 0
|
54
|
+
max = [Math.exp(elem) / coeff, max].min
|
55
|
+
# elsif coeff < 0
|
56
|
+
# min = [Math.exp(elem) / coeff, min].max # Math.exp(elem) / coeff is always < 0 hence minimal pseudocount is zero
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
max
|
61
|
+
end
|
62
|
+
|
63
|
+
private :max_pseudocount_fraction, :weighted_position_exponent
|
64
|
+
|
65
|
+
def convert(pwm)
|
66
|
+
raise Error, "Can convert only PWMs" unless MotifModel.acts_as_pwm?(pwm)
|
67
|
+
actual_pseudocount = calculate_pseudocount(pwm)
|
68
|
+
matrix = pwm.each_position.map do |pos|
|
69
|
+
beta = Math.log( weighted_position_exponent(pos) )
|
70
|
+
pwm_pos = pos.each_index.map do |index|
|
71
|
+
(@count + actual_pseudocount) * @background.frequencies[index] * Math.exp( pos[index] ) * Math.exp( -beta ) - actual_pseudocount * @background.frequencies[index]
|
72
|
+
end
|
73
|
+
pwm_pos
|
74
|
+
end
|
75
|
+
|
76
|
+
pcm = MotifModel::PCM.new(matrix)
|
77
|
+
if pwm.respond_to? :name
|
78
|
+
pcm.named(pwm.name)
|
79
|
+
else
|
80
|
+
pcm
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
@@ -1,11 +1,5 @@
|
|
1
|
-
require_relative '
|
2
|
-
|
1
|
+
require_relative 'data_models/named_model'
|
3
2
|
require_relative 'data_models/pm'
|
4
3
|
require_relative 'data_models/pcm'
|
5
4
|
require_relative 'data_models/ppm'
|
6
5
|
require_relative 'data_models/pwm'
|
7
|
-
|
8
|
-
require_relative 'data_models/collection'
|
9
|
-
|
10
|
-
#require_relative 'bioinform/data_models/iupac_word'
|
11
|
-
#require_relative 'bioinform/data_models/iupac_wordset'
|