bioinform 0.1.17 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -3
- data/LICENSE +0 -1
- data/README.md +1 -1
- data/TODO.txt +23 -30
- data/bin/convert_motif +4 -0
- data/bin/pcm2pwm +1 -1
- data/bin/split_motifs +1 -1
- data/bioinform.gemspec +0 -2
- data/lib/bioinform.rb +54 -16
- data/lib/bioinform/alphabet.rb +85 -0
- data/lib/bioinform/background.rb +90 -0
- data/lib/bioinform/cli.rb +1 -2
- data/lib/bioinform/cli/convert_motif.rb +52 -17
- data/lib/bioinform/cli/pcm2pwm.rb +32 -26
- data/lib/bioinform/cli/split_motifs.rb +31 -30
- data/lib/bioinform/conversion_algorithms.rb +6 -0
- data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
- data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
- data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
- data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
- data/lib/bioinform/data_models.rb +1 -7
- data/lib/bioinform/data_models/named_model.rb +38 -0
- data/lib/bioinform/data_models/pcm.rb +18 -28
- data/lib/bioinform/data_models/pm.rb +73 -170
- data/lib/bioinform/data_models/ppm.rb +11 -24
- data/lib/bioinform/data_models/pwm.rb +30 -56
- data/lib/bioinform/errors.rb +17 -0
- data/lib/bioinform/formatters.rb +4 -2
- data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
- data/lib/bioinform/formatters/motif_formatter.rb +69 -0
- data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
- data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
- data/lib/bioinform/parsers.rb +1 -8
- data/lib/bioinform/parsers/matrix_parser.rb +44 -36
- data/lib/bioinform/parsers/motif_splitter.rb +45 -0
- data/lib/bioinform/support.rb +46 -14
- data/lib/bioinform/support/strip_doc.rb +1 -1
- data/lib/bioinform/version.rb +1 -1
- data/spec/alphabet_spec.rb +79 -0
- data/spec/background_spec.rb +57 -0
- data/spec/cli/cli_spec.rb +6 -6
- data/spec/cli/convert_motif_spec.rb +88 -88
- data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
- data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
- data/spec/cli/pcm2pwm_spec.rb +22 -23
- data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
- data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
- data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
- data/spec/cli/split_motifs_spec.rb +6 -21
- data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
- data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
- data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
- data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
- data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
- data/spec/data_models/named_model_spec.rb +41 -0
- data/spec/data_models/pcm_spec.rb +114 -45
- data/spec/data_models/pm_spec.rb +132 -333
- data/spec/data_models/ppm_spec.rb +47 -44
- data/spec/data_models/pwm_spec.rb +85 -77
- data/spec/fabricators/motif_formats_fabricator.rb +116 -116
- data/spec/formatters/consensus_formatter_spec.rb +26 -0
- data/spec/formatters/raw_formatter_spec.rb +169 -0
- data/spec/parsers/matrix_parser_spec.rb +216 -0
- data/spec/parsers/motif_splitter_spec.rb +87 -0
- data/spec/spec_helper.rb +2 -2
- data/spec/spec_helper_source.rb +25 -5
- data/spec/support_spec.rb +31 -0
- metadata +43 -124
- data/bin/merge_into_collection +0 -4
- data/lib/bioinform/cli/merge_into_collection.rb +0 -80
- data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
- data/lib/bioinform/data_models/collection.rb +0 -75
- data/lib/bioinform/data_models/motif.rb +0 -56
- data/lib/bioinform/formatters/raw_formatter.rb +0 -41
- data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
- data/lib/bioinform/parsers/parser.rb +0 -92
- data/lib/bioinform/parsers/splittable_parser.rb +0 -57
- data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
- data/lib/bioinform/parsers/string_parser.rb +0 -72
- data/lib/bioinform/parsers/trivial_parser.rb +0 -34
- data/lib/bioinform/parsers/yaml_parser.rb +0 -35
- data/lib/bioinform/support/advanced_scan.rb +0 -8
- data/lib/bioinform/support/array_product.rb +0 -6
- data/lib/bioinform/support/array_zip.rb +0 -6
- data/lib/bioinform/support/collect_hash.rb +0 -7
- data/lib/bioinform/support/deep_dup.rb +0 -5
- data/lib/bioinform/support/delete_many.rb +0 -14
- data/lib/bioinform/support/inverf.rb +0 -13
- data/lib/bioinform/support/multiline_squish.rb +0 -6
- data/lib/bioinform/support/parameters.rb +0 -28
- data/lib/bioinform/support/partial_sums.rb +0 -16
- data/lib/bioinform/support/same_by.rb +0 -12
- data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
- data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
- data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
- data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
- data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
- data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
- data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
- data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
- data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
- data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
- data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
- data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
- data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
- data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
- data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
- data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
- data/spec/cli/data/split_motifs/collection.yaml +0 -188
- data/spec/cli/merge_into_collection_spec.rb +0 -100
- data/spec/data_models/collection_spec.rb +0 -98
- data/spec/data_models/motif_spec.rb +0 -224
- data/spec/fabricators/collection_fabricator.rb +0 -8
- data/spec/fabricators/motif_fabricator.rb +0 -33
- data/spec/fabricators/pcm_fabricator.rb +0 -25
- data/spec/fabricators/pm_fabricator.rb +0 -52
- data/spec/fabricators/ppm_fabricator.rb +0 -14
- data/spec/fabricators/pwm_fabricator.rb +0 -16
- data/spec/parsers/parser_spec.rb +0 -152
- data/spec/parsers/string_fantom_parser_spec.rb +0 -70
- data/spec/parsers/string_parser_spec.rb +0 -77
- data/spec/parsers/trivial_parser_spec.rb +0 -64
- data/spec/parsers/yaml_parser_spec.rb +0 -50
- data/spec/support/advanced_scan_spec.rb +0 -32
- data/spec/support/array_product_spec.rb +0 -15
- data/spec/support/array_zip_spec.rb +0 -15
- data/spec/support/collect_hash_spec.rb +0 -15
- data/spec/support/delete_many_spec.rb +0 -44
- data/spec/support/inverf_spec.rb +0 -19
- data/spec/support/multiline_squish_spec.rb +0 -25
- data/spec/support/partial_sums_spec.rb +0 -30
- data/spec/support/same_by_spec.rb +0 -36
|
@@ -1,47 +1,53 @@
|
|
|
1
1
|
require_relative '../../bioinform'
|
|
2
|
-
require '
|
|
2
|
+
require 'optparse'
|
|
3
3
|
require 'shellwords'
|
|
4
4
|
|
|
5
5
|
module Bioinform
|
|
6
|
-
module CLI
|
|
6
|
+
module CLI
|
|
7
7
|
module PCM2PWM
|
|
8
8
|
extend Bioinform::CLI::Helpers
|
|
9
9
|
def self.main(argv)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
10
|
+
options = {folder: '.', extension: 'pwm'}
|
|
11
|
+
opt_parser = OptionParser.new do |opts|
|
|
12
|
+
opts.banner = "PCM to PWM converter.\n" +
|
|
13
|
+
"It transforms files with PCMs into files with PWMs.\n" +
|
|
14
|
+
"Folder for resulting files to save files can be specified.\n" +
|
|
15
|
+
"Resulting PWM files have the same name as original file but have another extension (.pwm by default).\n" +
|
|
16
|
+
"When filelist is empty, it's obtained from STDIN.\n" +
|
|
17
|
+
"One can use it: `ls -b pcm_folder/*.pcm | pcm2pwm` (ls -b option escape spaces in filenames)\n" +
|
|
18
|
+
"\n" +
|
|
19
|
+
"Usage:\n" +
|
|
20
|
+
" pcm2pwm [options] [<pcm-files>...]"
|
|
21
|
+
opts.version = ::Bioinform::VERSION
|
|
22
|
+
opts.on('-e', '--extension EXT', 'Extension of output files [default: pwm]') do |v|
|
|
23
|
+
options[:extension] = v
|
|
24
|
+
end
|
|
25
|
+
opts.on('-f', '--folder FOLDER', 'Where to save output files') do |v|
|
|
26
|
+
options[:folder] = v
|
|
27
|
+
end
|
|
28
|
+
end
|
|
26
29
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
+
opt_parser.parse!(argv)
|
|
31
|
+
pcm_files = argv
|
|
32
|
+
folder = options[:folder]
|
|
33
|
+
extension = options[:extension]
|
|
30
34
|
|
|
31
35
|
Dir.mkdir(folder) unless Dir.exist?(folder)
|
|
32
36
|
filelist = (pcm_files.empty?) ? $stdin.read.shellsplit : pcm_files
|
|
33
37
|
|
|
38
|
+
converter = ConversionAlgorithms::PCM2PWMConverter.new()
|
|
39
|
+
|
|
34
40
|
filelist.each do |filename|
|
|
35
|
-
|
|
41
|
+
input = File.read(filename)
|
|
42
|
+
motif_data = MatrixParser.new.parse(input)
|
|
43
|
+
pcm = MotifModel::PCM.new(motif_data[:matrix]).named(motif_data[:name])
|
|
44
|
+
pwm = converter.convert(pcm)
|
|
36
45
|
File.open(change_folder_and_extension(filename, extension, folder), 'w') do |f|
|
|
37
46
|
f.puts pwm
|
|
38
47
|
end
|
|
39
48
|
end
|
|
40
|
-
|
|
41
|
-
rescue Docopt::Exit => e
|
|
42
|
-
puts e.message
|
|
43
49
|
end
|
|
44
50
|
|
|
45
51
|
end
|
|
46
52
|
end
|
|
47
|
-
end
|
|
53
|
+
end
|
|
@@ -1,47 +1,48 @@
|
|
|
1
1
|
require_relative '../../bioinform'
|
|
2
|
-
require '
|
|
2
|
+
require 'optparse'
|
|
3
3
|
|
|
4
4
|
module Bioinform
|
|
5
5
|
module CLI
|
|
6
6
|
module SplitMotifs
|
|
7
7
|
extend Bioinform::CLI::Helpers
|
|
8
8
|
def self.main(argv)
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
9
|
+
options = {folder: '.'}
|
|
10
|
+
opt_parser = OptionParser.new do |opts|
|
|
11
|
+
opts.version = ::Bioinform::VERSION
|
|
12
|
+
opts.banner = "Motif splitter.\n" +
|
|
13
|
+
"It gets a file with a set of motifs and splits it into motifs according to their names.\n" +
|
|
14
|
+
"\n" +
|
|
15
|
+
"Usage:\n" +
|
|
16
|
+
" split_motifs [options] <collection-file>"
|
|
17
|
+
opts.on('-e', '--extension EXT', 'Extension of output files') do |v|
|
|
18
|
+
options[:extension] = v
|
|
19
|
+
end
|
|
20
|
+
opts.on('-f', '--folder FOLDER', 'Where to save output files') do |v|
|
|
21
|
+
options[:folder] = v
|
|
22
|
+
end
|
|
23
|
+
end
|
|
21
24
|
|
|
22
|
-
|
|
23
|
-
|
|
25
|
+
opt_parser.parse!(argv)
|
|
26
|
+
folder = options[:folder]
|
|
27
|
+
extension = options[:extension]
|
|
28
|
+
collection_filename = argv.first
|
|
24
29
|
|
|
25
|
-
folder = options['--folder']
|
|
26
|
-
extension = options['--extension']
|
|
27
|
-
collection_filename = options['<collection-file>']
|
|
28
30
|
|
|
29
31
|
Dir.mkdir(folder) unless Dir.exist?(folder)
|
|
30
|
-
raise "
|
|
32
|
+
raise "Collection file not specified" unless collection_filename
|
|
33
|
+
raise "File `#{collection_filename}` not exist" unless File.exist?(collection_filename)
|
|
31
34
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
35
|
+
motif_list_string = File.read(collection_filename)
|
|
36
|
+
coll = MotifSplitter.new.split(motif_list_string).map do |motif_string|
|
|
37
|
+
motif_info = MatrixParser.new.parse(motif_string)
|
|
38
|
+
MotifModel::PM.new(motif_info[:matrix]).named(motif_info[:name])
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
coll.each do |motif|
|
|
42
|
+
File.open(set_folder(folder, set_extension(motif.name, extension || 'mat')), 'w'){|f| f.puts motif }
|
|
40
43
|
end
|
|
41
|
-
rescue Docopt::Exit => e
|
|
42
|
-
puts e.message
|
|
43
44
|
end
|
|
44
45
|
|
|
45
46
|
end
|
|
46
47
|
end
|
|
47
|
-
end
|
|
48
|
+
end
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
require_relative 'conversion_algorithms/pcm2ppm_converter'
|
|
2
|
+
require_relative 'conversion_algorithms/pcm2pwm_converter'
|
|
3
|
+
require_relative 'conversion_algorithms/ppm2pcm_converter'
|
|
4
|
+
require_relative 'conversion_algorithms/pwm2pcm_converter'
|
|
5
|
+
require_relative 'conversion_algorithms/pwm2iupac_pwm_converter'
|
|
6
|
+
require_relative 'conversion_algorithms/pcm2pwm_mara_converter'
|
|
@@ -1,19 +1,21 @@
|
|
|
1
|
+
require_relative '../data_models'
|
|
2
|
+
|
|
1
3
|
module Bioinform
|
|
2
4
|
module ConversionAlgorithms
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def self.convert(pcm, parameters = {})
|
|
5
|
+
class PCM2PPMConverter
|
|
6
|
+
def convert(pcm)
|
|
7
|
+
raise Error, "#{self.class}#convert accepts only models acting as PCM" unless MotifModel.acts_as_pcm?(pcm)
|
|
7
8
|
matrix = pcm.each_position.map do |pos|
|
|
8
|
-
pos.
|
|
9
|
-
|
|
10
|
-
|
|
9
|
+
count = pos.inject(0.0, &:+)
|
|
10
|
+
pos.map {|el| el / count }
|
|
11
|
+
end
|
|
12
|
+
ppm = MotifModel::PPM.new(matrix)
|
|
13
|
+
if pcm.respond_to? :name
|
|
14
|
+
ppm.named(pcm.name)
|
|
15
|
+
else
|
|
16
|
+
ppm
|
|
11
17
|
end
|
|
12
|
-
PPM.new(pcm.get_parameters.merge(matrix: matrix))
|
|
13
18
|
end
|
|
14
19
|
end
|
|
15
20
|
end
|
|
16
21
|
end
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
@@ -1,20 +1,48 @@
|
|
|
1
|
+
require_relative '../errors'
|
|
2
|
+
require_relative '../data_models'
|
|
3
|
+
require_relative '../background'
|
|
4
|
+
|
|
1
5
|
module Bioinform
|
|
2
6
|
module ConversionAlgorithms
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
7
|
+
# s_{\alpha,j} = ln(\frac{x_{\alpha,j} + \kappa p_{\alpha}}{(N+\kappa)p_{\alpha}})
|
|
8
|
+
class PCM2PWMConverter
|
|
9
|
+
attr_reader :background, :pseudocount
|
|
10
|
+
def initialize(options = {})
|
|
11
|
+
@background = options.fetch(:background, Bioinform::Background::Uniform)
|
|
12
|
+
@pseudocount = options.fetch(:pseudocount, :log)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def calculate_pseudocount(pcm)
|
|
16
|
+
case @pseudocount
|
|
17
|
+
when Numeric
|
|
18
|
+
@pseudocount
|
|
19
|
+
when :log
|
|
20
|
+
Math.log(pcm.count)
|
|
21
|
+
when :sqrt
|
|
22
|
+
Math.sqrt(pcm.count)
|
|
23
|
+
when Proc
|
|
24
|
+
@pseudocount.call(pcm)
|
|
25
|
+
else
|
|
26
|
+
raise Error, 'Unknown pseudocount type use numeric or :log or :sqrt or Proc with taking pcm parameter'
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def convert(pcm)
|
|
31
|
+
raise Error, "#{self.class}#convert accepts only models acting as PCM" unless MotifModel.acts_as_pcm?(pcm)
|
|
32
|
+
actual_pseudocount = calculate_pseudocount(pcm)
|
|
11
33
|
matrix = pcm.each_position.map do |pos|
|
|
34
|
+
count = pos.inject(0.0, &:+)
|
|
12
35
|
pos.each_index.map do |index|
|
|
13
|
-
Math.log((pos[index] +
|
|
36
|
+
Math.log((pos[index] + @background.frequencies[index] * actual_pseudocount).to_f / (@background.frequencies[index]*(count + actual_pseudocount)) )
|
|
14
37
|
end
|
|
15
38
|
end
|
|
16
|
-
PWM.new(
|
|
39
|
+
pwm = MotifModel::PWM.new(matrix)
|
|
40
|
+
if pcm.respond_to? :name
|
|
41
|
+
pwm.named(pcm.name)
|
|
42
|
+
else
|
|
43
|
+
pwm
|
|
44
|
+
end
|
|
17
45
|
end
|
|
18
46
|
end
|
|
19
47
|
end
|
|
20
|
-
end
|
|
48
|
+
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
require_relative '../errors'
|
|
2
|
+
require_relative '../data_models'
|
|
3
|
+
require_relative '../background'
|
|
4
|
+
|
|
5
|
+
module Bioinform
|
|
6
|
+
module ConversionAlgorithms
|
|
7
|
+
# s_{\alpha,j} = ln(\frac{x_{\alpha,j} + \kappa p_{\alpha}}{(N+\kappa)p_{\alpha}}) with fixed \kappa = 2 and p_{\alpha} = 0.25
|
|
8
|
+
class MaraPCM2PWMConverter
|
|
9
|
+
def convert(pcm)
|
|
10
|
+
raise Error, "#{self.class}#convert accepts only models acting as PCM" unless MotifModel.acts_as_pcm?(pcm)
|
|
11
|
+
matrix = pcm.each_position.map do |pos|
|
|
12
|
+
count = pos.inject(0.0, &:+)
|
|
13
|
+
pos.each_index.map do |index|
|
|
14
|
+
Math.log((pos[index] + 0.5).to_f / (0.25 * (count + 2)) )
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
pwm = MotifModel::PWM.new(matrix)
|
|
18
|
+
if pcm.respond_to? :name
|
|
19
|
+
pwm.named(pcm.name)
|
|
20
|
+
else
|
|
21
|
+
pwm
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require_relative '../errors'
|
|
2
|
+
require_relative '../data_models'
|
|
3
|
+
|
|
4
|
+
module Bioinform
|
|
5
|
+
module ConversionAlgorithms
|
|
6
|
+
class PPM2PCMConverter
|
|
7
|
+
attr_reader :count
|
|
8
|
+
|
|
9
|
+
def initialize(options = {})
|
|
10
|
+
@count = options.fetch(:count, 100)
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def convert(ppm)
|
|
14
|
+
raise Error, "#{self.class}#convert accepts only models acting as PPM" unless MotifModel.acts_as_ppm?(ppm)
|
|
15
|
+
matrix = ppm.each_position.map do |pos|
|
|
16
|
+
pos.map do |el|
|
|
17
|
+
el * @count
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
pcm = MotifModel::PCM.new(matrix)
|
|
22
|
+
if ppm.respond_to? :name
|
|
23
|
+
pcm.named(ppm.name)
|
|
24
|
+
else
|
|
25
|
+
pcm
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require_relative '../alphabet'
|
|
2
|
+
|
|
3
|
+
module Bioinform
|
|
4
|
+
module ConversionAlgorithms
|
|
5
|
+
class PWM2IupacPWMConverter
|
|
6
|
+
attr_reader :iupac_alphabet
|
|
7
|
+
def initialize(options = {})
|
|
8
|
+
@iupac_alphabet = options.fetch(:alphabet, NucleotideAlphabetWithN)
|
|
9
|
+
end
|
|
10
|
+
def convert(pwm)
|
|
11
|
+
raise Error, "Can convert only PWMs" unless MotifModel.acts_as_pwm?(pwm)
|
|
12
|
+
raise Error, 'this conversion is possible only for ACGT-nucleotide motifs' unless pwm.alphabet == NucleotideAlphabet
|
|
13
|
+
iupac_matrix = pwm.each_position.map do |pos|
|
|
14
|
+
@iupac_alphabet.each_letter.map do |letter|
|
|
15
|
+
nucleotide_indices = IUPAC::NucleotideIndicesByIUPACLetter[letter]
|
|
16
|
+
nucleotide_indices.inject(0.0){|sum, nucleotide_index| sum + pos[nucleotide_index] } / nucleotide_indices.size
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
MotifModel::PWM.new(iupac_matrix, alphabet: @iupac_alphabet)
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
require_relative '../data_models'
|
|
2
|
+
|
|
3
|
+
module Bioinform
|
|
4
|
+
module ConversionAlgorithms
|
|
5
|
+
|
|
6
|
+
# This algorithm is purely heuristic; it is based on our algorithm of PWM calculation.
|
|
7
|
+
# pcm --> pwm:
|
|
8
|
+
# s_{\alpha,j} = ln(\frac{x_{\alpha,j} + \kappa p_{\alpha}}{(N+\kappa)p_{\alpha}}) + \beta_{j}
|
|
9
|
+
# \beta_j is an arbitrary constant
|
|
10
|
+
# Hence
|
|
11
|
+
# pwm --> pcm:
|
|
12
|
+
# x_{\alpha,j} = (N + \cappa) p_{\alpha} \exp{ s_{\alpha,j} - \beta_j } - \cappa p_{\alpha}
|
|
13
|
+
# \beta_j = log(\sum_{\alpha} p_{\alpha} \exp{s_{\alpha,j}}) because \sum_{\alpha} x_{\alpha,j} = N
|
|
14
|
+
class PWM2PCMConverter
|
|
15
|
+
attr_reader :pseudocount, :count, :background
|
|
16
|
+
|
|
17
|
+
def initialize(options = {})
|
|
18
|
+
@pseudocount = options.fetch(:pseudocount, :default)
|
|
19
|
+
@count = options.fetch(:count, 100.0)
|
|
20
|
+
@background = options.fetch(:background, Bioinform::Background::Uniform)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def calculate_pseudocount(pwm)
|
|
24
|
+
case @pseudocount
|
|
25
|
+
when Numeric
|
|
26
|
+
@pseudocount
|
|
27
|
+
when :default
|
|
28
|
+
# the 0.95 factor keeps the pseudocount safely below the real maximum, so that rounding errors cannot yield a PCM with negative elements
|
|
29
|
+
max_pseudocount = max_pseudocount_fraction(pwm) * @count
|
|
30
|
+
(Math.log(@count) <= max_pseudocount*0.95) ? Math.log(@count) : max_pseudocount * 0.95
|
|
31
|
+
when Proc
|
|
32
|
+
@pseudocount.call(pwm)
|
|
33
|
+
else
|
|
34
|
+
raise Error, 'Unknown pseudocount type use numeric or :default or Proc with taking pcm parameter'
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# \sum_{\alpha} p_{\alpha} \exp{s_{\alpha,j}}
|
|
39
|
+
def weighted_position_exponent(pos)
|
|
40
|
+
pos.each_with_index.map {|elem, letter_index| @background.frequencies[letter_index] * Math.exp(elem) }.inject(0.0, &:+)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# possible (pseudocount / count) range is from 0 to max_pseudocount_fraction
|
|
44
|
+
# it's derived from
|
|
45
|
+
# (-\exp{s_{\alpha,j}} + \sum_{\gamma} p_{\gamma}\exp{s_{\gamma,j}}) * pseudocount < count * \exp{s_{\alpha,j}}
|
|
46
|
+
# which is derived from the fact that each element of PCM should be not less than 0
|
|
47
|
+
def max_pseudocount_fraction(pwm)
|
|
48
|
+
# min = 0.0
|
|
49
|
+
max = Float::INFINITY
|
|
50
|
+
pwm.each_position do |pos|
|
|
51
|
+
pos.each_with_index do |elem, letter_index|
|
|
52
|
+
coeff = weighted_position_exponent(pos) - Math.exp(elem)
|
|
53
|
+
if coeff > 0
|
|
54
|
+
max = [Math.exp(elem) / coeff, max].min
|
|
55
|
+
# elsif coeff < 0
|
|
56
|
+
# min = [Math.exp(elem) / coeff, min].max # Math.exp(elem) / coeff is always < 0 hence minimal pseudocount is zero
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
max
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
private :max_pseudocount_fraction, :weighted_position_exponent
|
|
64
|
+
|
|
65
|
+
def convert(pwm)
|
|
66
|
+
raise Error, "Can convert only PWMs" unless MotifModel.acts_as_pwm?(pwm)
|
|
67
|
+
actual_pseudocount = calculate_pseudocount(pwm)
|
|
68
|
+
matrix = pwm.each_position.map do |pos|
|
|
69
|
+
beta = Math.log( weighted_position_exponent(pos) )
|
|
70
|
+
pwm_pos = pos.each_index.map do |index|
|
|
71
|
+
(@count + actual_pseudocount) * @background.frequencies[index] * Math.exp( pos[index] ) * Math.exp( -beta ) - actual_pseudocount * @background.frequencies[index]
|
|
72
|
+
end
|
|
73
|
+
pwm_pos
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
pcm = MotifModel::PCM.new(matrix)
|
|
77
|
+
if pwm.respond_to? :name
|
|
78
|
+
pcm.named(pwm.name)
|
|
79
|
+
else
|
|
80
|
+
pcm
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
@@ -1,11 +1,5 @@
|
|
|
1
|
-
require_relative '
|
|
2
|
-
|
|
1
|
+
require_relative 'data_models/named_model'
|
|
3
2
|
require_relative 'data_models/pm'
|
|
4
3
|
require_relative 'data_models/pcm'
|
|
5
4
|
require_relative 'data_models/ppm'
|
|
6
5
|
require_relative 'data_models/pwm'
|
|
7
|
-
|
|
8
|
-
require_relative 'data_models/collection'
|
|
9
|
-
|
|
10
|
-
#require_relative 'bioinform/data_models/iupac_word'
|
|
11
|
-
#require_relative 'bioinform/data_models/iupac_wordset'
|