bioinform 0.1.17 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (145) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -3
  3. data/LICENSE +0 -1
  4. data/README.md +1 -1
  5. data/TODO.txt +23 -30
  6. data/bin/convert_motif +4 -0
  7. data/bin/pcm2pwm +1 -1
  8. data/bin/split_motifs +1 -1
  9. data/bioinform.gemspec +0 -2
  10. data/lib/bioinform.rb +54 -16
  11. data/lib/bioinform/alphabet.rb +85 -0
  12. data/lib/bioinform/background.rb +90 -0
  13. data/lib/bioinform/cli.rb +1 -2
  14. data/lib/bioinform/cli/convert_motif.rb +52 -17
  15. data/lib/bioinform/cli/pcm2pwm.rb +32 -26
  16. data/lib/bioinform/cli/split_motifs.rb +31 -30
  17. data/lib/bioinform/conversion_algorithms.rb +6 -0
  18. data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
  19. data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
  20. data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
  21. data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
  22. data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
  23. data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
  24. data/lib/bioinform/data_models.rb +1 -7
  25. data/lib/bioinform/data_models/named_model.rb +38 -0
  26. data/lib/bioinform/data_models/pcm.rb +18 -28
  27. data/lib/bioinform/data_models/pm.rb +73 -170
  28. data/lib/bioinform/data_models/ppm.rb +11 -24
  29. data/lib/bioinform/data_models/pwm.rb +30 -56
  30. data/lib/bioinform/errors.rb +17 -0
  31. data/lib/bioinform/formatters.rb +4 -2
  32. data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
  33. data/lib/bioinform/formatters/motif_formatter.rb +69 -0
  34. data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
  35. data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
  36. data/lib/bioinform/parsers.rb +1 -8
  37. data/lib/bioinform/parsers/matrix_parser.rb +44 -36
  38. data/lib/bioinform/parsers/motif_splitter.rb +45 -0
  39. data/lib/bioinform/support.rb +46 -14
  40. data/lib/bioinform/support/strip_doc.rb +1 -1
  41. data/lib/bioinform/version.rb +1 -1
  42. data/spec/alphabet_spec.rb +79 -0
  43. data/spec/background_spec.rb +57 -0
  44. data/spec/cli/cli_spec.rb +6 -6
  45. data/spec/cli/convert_motif_spec.rb +88 -88
  46. data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
  47. data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
  48. data/spec/cli/pcm2pwm_spec.rb +22 -23
  49. data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
  50. data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
  51. data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
  52. data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
  53. data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
  54. data/spec/cli/split_motifs_spec.rb +6 -21
  55. data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
  56. data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
  57. data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
  58. data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
  59. data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
  60. data/spec/data_models/named_model_spec.rb +41 -0
  61. data/spec/data_models/pcm_spec.rb +114 -45
  62. data/spec/data_models/pm_spec.rb +132 -333
  63. data/spec/data_models/ppm_spec.rb +47 -44
  64. data/spec/data_models/pwm_spec.rb +85 -77
  65. data/spec/fabricators/motif_formats_fabricator.rb +116 -116
  66. data/spec/formatters/consensus_formatter_spec.rb +26 -0
  67. data/spec/formatters/raw_formatter_spec.rb +169 -0
  68. data/spec/parsers/matrix_parser_spec.rb +216 -0
  69. data/spec/parsers/motif_splitter_spec.rb +87 -0
  70. data/spec/spec_helper.rb +2 -2
  71. data/spec/spec_helper_source.rb +25 -5
  72. data/spec/support_spec.rb +31 -0
  73. metadata +43 -124
  74. data/bin/merge_into_collection +0 -4
  75. data/lib/bioinform/cli/merge_into_collection.rb +0 -80
  76. data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
  77. data/lib/bioinform/data_models/collection.rb +0 -75
  78. data/lib/bioinform/data_models/motif.rb +0 -56
  79. data/lib/bioinform/formatters/raw_formatter.rb +0 -41
  80. data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
  81. data/lib/bioinform/parsers/parser.rb +0 -92
  82. data/lib/bioinform/parsers/splittable_parser.rb +0 -57
  83. data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
  84. data/lib/bioinform/parsers/string_parser.rb +0 -72
  85. data/lib/bioinform/parsers/trivial_parser.rb +0 -34
  86. data/lib/bioinform/parsers/yaml_parser.rb +0 -35
  87. data/lib/bioinform/support/advanced_scan.rb +0 -8
  88. data/lib/bioinform/support/array_product.rb +0 -6
  89. data/lib/bioinform/support/array_zip.rb +0 -6
  90. data/lib/bioinform/support/collect_hash.rb +0 -7
  91. data/lib/bioinform/support/deep_dup.rb +0 -5
  92. data/lib/bioinform/support/delete_many.rb +0 -14
  93. data/lib/bioinform/support/inverf.rb +0 -13
  94. data/lib/bioinform/support/multiline_squish.rb +0 -6
  95. data/lib/bioinform/support/parameters.rb +0 -28
  96. data/lib/bioinform/support/partial_sums.rb +0 -16
  97. data/lib/bioinform/support/same_by.rb +0 -12
  98. data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
  99. data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
  100. data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
  101. data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
  102. data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
  103. data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
  104. data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
  105. data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
  106. data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
  107. data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
  108. data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
  109. data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
  110. data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
  111. data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
  112. data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
  113. data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
  114. data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
  115. data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
  116. data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
  117. data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
  118. data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
  119. data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
  120. data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
  121. data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
  122. data/spec/cli/data/split_motifs/collection.yaml +0 -188
  123. data/spec/cli/merge_into_collection_spec.rb +0 -100
  124. data/spec/data_models/collection_spec.rb +0 -98
  125. data/spec/data_models/motif_spec.rb +0 -224
  126. data/spec/fabricators/collection_fabricator.rb +0 -8
  127. data/spec/fabricators/motif_fabricator.rb +0 -33
  128. data/spec/fabricators/pcm_fabricator.rb +0 -25
  129. data/spec/fabricators/pm_fabricator.rb +0 -52
  130. data/spec/fabricators/ppm_fabricator.rb +0 -14
  131. data/spec/fabricators/pwm_fabricator.rb +0 -16
  132. data/spec/parsers/parser_spec.rb +0 -152
  133. data/spec/parsers/string_fantom_parser_spec.rb +0 -70
  134. data/spec/parsers/string_parser_spec.rb +0 -77
  135. data/spec/parsers/trivial_parser_spec.rb +0 -64
  136. data/spec/parsers/yaml_parser_spec.rb +0 -50
  137. data/spec/support/advanced_scan_spec.rb +0 -32
  138. data/spec/support/array_product_spec.rb +0 -15
  139. data/spec/support/array_zip_spec.rb +0 -15
  140. data/spec/support/collect_hash_spec.rb +0 -15
  141. data/spec/support/delete_many_spec.rb +0 -44
  142. data/spec/support/inverf_spec.rb +0 -19
  143. data/spec/support/multiline_squish_spec.rb +0 -25
  144. data/spec/support/partial_sums_spec.rb +0 -30
  145. data/spec/support/same_by_spec.rb +0 -36
@@ -1,47 +1,53 @@
1
1
  require_relative '../../bioinform'
2
- require 'docopt'
2
+ require 'optparse'
3
3
  require 'shellwords'
4
4
 
5
5
  module Bioinform
6
- module CLI
6
+ module CLI
7
7
  module PCM2PWM
8
8
  extend Bioinform::CLI::Helpers
9
9
  def self.main(argv)
10
- doc = <<-DOCOPT
11
- PCM to PWM converter.
12
- It transforms files with PCMs into files with PWMs. Folder for resulting files to save files can be specified. Resulting PWM files have the same name as original file but have another extension (.pwm by default).
13
- When filelist is empty, it's obtained from STDIN. One can use it: `ls -b pcm_folder/*.pcm | pcm2pwm` (ls -b option escape spaces in filenames)
14
-
15
- Usage:
16
- pcm2pwm [options] [<pcm-files>...]
17
-
18
- Options:
19
- -h --help Show this screen.
20
- -e --extension EXT Extension of output files [default: pwm]
21
- -f --folder FOLDER Where to save output files [default: .]
22
- DOCOPT
23
-
24
- doc.gsub!(/^#{doc[/\A +/]}/,'')
25
- options = Docopt::docopt(doc, argv: argv)
10
+ options = {folder: '.', extension: 'pwm'}
11
+ opt_parser = OptionParser.new do |opts|
12
+ opts.banner = "PCM to PWM converter.\n" +
13
+ "It transforms files with PCMs into files with PWMs.\n" +
14
+ "Folder for resulting files to save files can be specified.\n" +
15
+ "Resulting PWM files have the same name as original file but have another extension (.pwm by default).\n" +
16
+ "When filelist is empty, it's obtained from STDIN.\n" +
17
+ "One can use it: `ls -b pcm_folder/*.pcm | pcm2pwm` (ls -b option escape spaces in filenames)\n" +
18
+ "\n" +
19
+ "Usage:\n" +
20
+ " pcm2pwm [options] [<pcm-files>...]"
21
+ opts.version = ::Bioinform::VERSION
22
+ opts.on('-e', '--extension EXT', 'Extension of output files [default: pwm]') do |v|
23
+ options[:extension] = v
24
+ end
25
+ opts.on('-f', '--folder FOLDER', 'Where to save output files') do |v|
26
+ options[:folder] = v
27
+ end
28
+ end
26
29
 
27
- pcm_files = options['<pcm-files>']
28
- folder = options['--folder']
29
- extension = options['--extension']
30
+ opt_parser.parse!(argv)
31
+ pcm_files = argv
32
+ folder = options[:folder]
33
+ extension = options[:extension]
30
34
 
31
35
  Dir.mkdir(folder) unless Dir.exist?(folder)
32
36
  filelist = (pcm_files.empty?) ? $stdin.read.shellsplit : pcm_files
33
37
 
38
+ converter = ConversionAlgorithms::PCM2PWMConverter.new()
39
+
34
40
  filelist.each do |filename|
35
- pwm = Bioinform::PCM.new( File.read(filename) ).to_pwm
41
+ input = File.read(filename)
42
+ motif_data = MatrixParser.new.parse(input)
43
+ pcm = MotifModel::PCM.new(motif_data[:matrix]).named(motif_data[:name])
44
+ pwm = converter.convert(pcm)
36
45
  File.open(change_folder_and_extension(filename, extension, folder), 'w') do |f|
37
46
  f.puts pwm
38
47
  end
39
48
  end
40
-
41
- rescue Docopt::Exit => e
42
- puts e.message
43
49
  end
44
50
 
45
51
  end
46
52
  end
47
- end
53
+ end
@@ -1,47 +1,48 @@
1
1
  require_relative '../../bioinform'
2
- require 'docopt'
2
+ require 'optparse'
3
3
 
4
4
  module Bioinform
5
5
  module CLI
6
6
  module SplitMotifs
7
7
  extend Bioinform::CLI::Helpers
8
8
  def self.main(argv)
9
- doc = <<-DOCOPT
10
- Motif splitter.
11
- It get a file with a set of motifs and splits it into motifs according to their names.
12
-
13
- Usage:
14
- split_motifs [options] <collection-file>
15
-
16
- Options:
17
- -h --help Show this screen.
18
- -e --extension EXT Extension of output files
19
- -f --folder FOLDER Where to save output files [default: .]
20
- DOCOPT
9
+ options = {folder: '.'}
10
+ opt_parser = OptionParser.new do |opts|
11
+ opts.version = ::Bioinform::VERSION
12
+ opts.banner = "Motif splitter.\n" +
13
+ "It gets a file with a set of motifs and splits it into motifs according to their names.\n" +
14
+ "\n" +
15
+ "Usage:\n" +
16
+ " split_motifs [options] <collection-file>"
17
+ opts.on('-e', '--extension EXT', 'Extension of output files') do |v|
18
+ options[:extension] = v
19
+ end
20
+ opts.on('-f', '--folder FOLDER', 'Where to save output files') do |v|
21
+ options[:folder] = v
22
+ end
23
+ end
21
24
 
22
- doc.gsub!(/^#{doc[/\A +/]}/,'')
23
- options = Docopt::docopt(doc, argv: argv)
25
+ opt_parser.parse!(argv)
26
+ folder = options[:folder]
27
+ extension = options[:extension]
28
+ collection_filename = argv.first
24
29
 
25
- folder = options['--folder']
26
- extension = options['--extension']
27
- collection_filename = options['<collection-file>']
28
30
 
29
31
  Dir.mkdir(folder) unless Dir.exist?(folder)
30
- raise "File #{collection_filename} not exist" unless File.exist? collection_filename
32
+ raise "Collection file not specified" unless collection_filename
33
+ raise "File `#{collection_filename}` not exist" unless File.exist?(collection_filename)
31
34
 
32
- input = File.read(collection_filename)
33
- Parser.choose(input).split.each do |motif|
34
- if motif.is_a? PM
35
- File.open(set_folder(folder, set_extension(motif.name, extension || motif.class.name.gsub(/^.*::/,'').downcase)), 'w'){|f| f.puts motif}
36
- else
37
- motif = PM.new(motif)
38
- File.open(set_folder(folder, set_extension(motif.name, extension || 'mat')), 'w'){|f| f.puts motif}
39
- end
35
+ motif_list_string = File.read(collection_filename)
36
+ coll = MotifSplitter.new.split(motif_list_string).map do |motif_string|
37
+ motif_info = MatrixParser.new.parse(motif_string)
38
+ MotifModel::PM.new(motif_info[:matrix]).named(motif_info[:name])
39
+ end
40
+
41
+ coll.each do |motif|
42
+ File.open(set_folder(folder, set_extension(motif.name, extension || 'mat')), 'w'){|f| f.puts motif }
40
43
  end
41
- rescue Docopt::Exit => e
42
- puts e.message
43
44
  end
44
45
 
45
46
  end
46
47
  end
47
- end
48
+ end
@@ -0,0 +1,6 @@
1
+ require_relative 'conversion_algorithms/pcm2ppm_converter'
2
+ require_relative 'conversion_algorithms/pcm2pwm_converter'
3
+ require_relative 'conversion_algorithms/ppm2pcm_converter'
4
+ require_relative 'conversion_algorithms/pwm2pcm_converter'
5
+ require_relative 'conversion_algorithms/pwm2iupac_pwm_converter'
6
+ require_relative 'conversion_algorithms/pcm2pwm_mara_converter'
@@ -1,19 +1,21 @@
1
+ require_relative '../data_models'
2
+
1
3
  module Bioinform
2
4
  module ConversionAlgorithms
3
- module PCM2PPMConverter
4
-
5
- # parameters hash is ignored
6
- def self.convert(pcm, parameters = {})
5
+ class PCM2PPMConverter
6
+ def convert(pcm)
7
+ raise Error, "#{self.class}#convert accepts only models acting as PCM" unless MotifModel.acts_as_pcm?(pcm)
7
8
  matrix = pcm.each_position.map do |pos|
8
- pos.map do |el|
9
- el.to_f / pcm.count
10
- end
9
+ count = pos.inject(0.0, &:+)
10
+ pos.map {|el| el / count }
11
+ end
12
+ ppm = MotifModel::PPM.new(matrix)
13
+ if pcm.respond_to? :name
14
+ ppm.named(pcm.name)
15
+ else
16
+ ppm
11
17
  end
12
- PPM.new(pcm.get_parameters.merge(matrix: matrix))
13
18
  end
14
19
  end
15
20
  end
16
21
  end
17
-
18
-
19
-
@@ -1,20 +1,48 @@
1
+ require_relative '../errors'
2
+ require_relative '../data_models'
3
+ require_relative '../background'
4
+
1
5
  module Bioinform
2
6
  module ConversionAlgorithms
3
- module PCM2PWMConverter
4
- def self.convert(pcm, parameters = {})
5
- default_parameters = {pseudocount: Math.log(pcm.count),
6
- probability: (pcm.probability || [0.25, 0.25, 0.25, 0.25])
7
- }
8
- parameters = default_parameters.merge(parameters)
9
- probability = parameters[:probability]
10
- pseudocount = parameters[:pseudocount]
7
+ # s_{\alpha,j} = \ln(\frac{x_{\alpha,j} + \kappa p_{\alpha}}{(N+\kappa)p_{\alpha}})
8
+ class PCM2PWMConverter
9
+ attr_reader :background, :pseudocount
10
+ def initialize(options = {})
11
+ @background = options.fetch(:background, Bioinform::Background::Uniform)
12
+ @pseudocount = options.fetch(:pseudocount, :log)
13
+ end
14
+
15
+ def calculate_pseudocount(pcm)
16
+ case @pseudocount
17
+ when Numeric
18
+ @pseudocount
19
+ when :log
20
+ Math.log(pcm.count)
21
+ when :sqrt
22
+ Math.sqrt(pcm.count)
23
+ when Proc
24
+ @pseudocount.call(pcm)
25
+ else
26
+ raise Error, 'Unknown pseudocount type use numeric or :log or :sqrt or Proc with taking pcm parameter'
27
+ end
28
+ end
29
+
30
+ def convert(pcm)
31
+ raise Error, "#{self.class}#convert accepts only models acting as PCM" unless MotifModel.acts_as_pcm?(pcm)
32
+ actual_pseudocount = calculate_pseudocount(pcm)
11
33
  matrix = pcm.each_position.map do |pos|
34
+ count = pos.inject(0.0, &:+)
12
35
  pos.each_index.map do |index|
13
- Math.log((pos[index] + probability[index] * pseudocount) / (probability[index]*(pcm.count + pseudocount)) )
36
+ Math.log((pos[index] + @background.frequencies[index] * actual_pseudocount).to_f / (@background.frequencies[index]*(count + actual_pseudocount)) )
14
37
  end
15
38
  end
16
- PWM.new(pcm.get_parameters.merge(matrix: matrix))
39
+ pwm = MotifModel::PWM.new(matrix)
40
+ if pcm.respond_to? :name
41
+ pwm.named(pcm.name)
42
+ else
43
+ pwm
44
+ end
17
45
  end
18
46
  end
19
47
  end
20
- end
48
+ end
@@ -0,0 +1,26 @@
1
+ require_relative '../errors'
2
+ require_relative '../data_models'
3
+ require_relative '../background'
4
+
5
+ module Bioinform
6
+ module ConversionAlgorithms
7
+ # s_{\alpha,j} = \ln(\frac{x_{\alpha,j} + \kappa p_{\alpha}}{(N+\kappa)p_{\alpha}})
8
+ class MaraPCM2PWMConverter
9
+ def convert(pcm)
10
+ raise Error, "#{self.class}#convert accepts only models acting as PCM" unless MotifModel.acts_as_pcm?(pcm)
11
+ matrix = pcm.each_position.map do |pos|
12
+ count = pos.inject(0.0, &:+)
13
+ pos.each_index.map do |index|
14
+ Math.log((pos[index] + 0.5).to_f / (0.25 * (count + 2)) )
15
+ end
16
+ end
17
+ pwm = MotifModel::PWM.new(matrix)
18
+ if pcm.respond_to? :name
19
+ pwm.named(pcm.name)
20
+ else
21
+ pwm
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,30 @@
1
+ require_relative '../errors'
2
+ require_relative '../data_models'
3
+
4
+ module Bioinform
5
+ module ConversionAlgorithms
6
+ class PPM2PCMConverter
7
+ attr_reader :count
8
+
9
+ def initialize(options = {})
10
+ @count = options.fetch(:count, 100)
11
+ end
12
+
13
+ def convert(ppm)
14
+ raise Error, "#{self.class}#convert accepts only models acting as PPM" unless MotifModel.acts_as_ppm?(ppm)
15
+ matrix = ppm.each_position.map do |pos|
16
+ pos.map do |el|
17
+ el * @count
18
+ end
19
+ end
20
+
21
+ pcm = MotifModel::PCM.new(matrix)
22
+ if ppm.respond_to? :name
23
+ pcm.named(ppm.name)
24
+ else
25
+ pcm
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,23 @@
1
+ require_relative '../alphabet'
2
+
3
+ module Bioinform
4
+ module ConversionAlgorithms
5
+ class PWM2IupacPWMConverter
6
+ attr_reader :iupac_alphabet
7
+ def initialize(options = {})
8
+ @iupac_alphabet = options.fetch(:alphabet, NucleotideAlphabetWithN)
9
+ end
10
+ def convert(pwm)
11
+ raise Error, "Can convert only PWMs" unless MotifModel.acts_as_pwm?(pwm)
12
+ raise Error, 'this conversion is possible only for ACGT-nucleotide motifs' unless pwm.alphabet == NucleotideAlphabet
13
+ iupac_matrix = pwm.each_position.map do |pos|
14
+ @iupac_alphabet.each_letter.map do |letter|
15
+ nucleotide_indices = IUPAC::NucleotideIndicesByIUPACLetter[letter]
16
+ nucleotide_indices.inject(0.0){|sum, nucleotide_index| sum + pos[nucleotide_index] } / nucleotide_indices.size
17
+ end
18
+ end
19
+ MotifModel::PWM.new(iupac_matrix, alphabet: @iupac_alphabet)
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,85 @@
1
+ require_relative '../data_models'
2
+
3
+ module Bioinform
4
+ module ConversionAlgorithms
5
+
6
+ # This algorithm is purely heuristic; it is based on our algorithm of PWM calculation.
7
+ # pcm --> pwm:
8
+ # s_{\alpha,j} = \ln(\frac{x_{\alpha,j} + \kappa p_{\alpha}}{(N+\kappa)p_{\alpha}}) + \beta_{j}
9
+ # \beta_j is an arbitrary constant
10
+ # Hence
11
+ # pwm --> pcm:
12
+ # x_{\alpha,j} = (N + \kappa) p_{\alpha} \exp{ s_{\alpha,j} - \beta_j } - \kappa p_{\alpha}
13
+ # \beta_j = \ln(\sum_{\alpha} p_{\alpha} \exp{s_{\alpha,j}}) because \sum_{\alpha} x_{\alpha,j} = N
14
+ class PWM2PCMConverter
15
+ attr_reader :pseudocount, :count, :background
16
+
17
+ def initialize(options = {})
18
+ @pseudocount = options.fetch(:pseudocount, :default)
19
+ @count = options.fetch(:count, 100.0)
20
+ @background = options.fetch(:background, Bioinform::Background::Uniform)
21
+ end
22
+
23
+ def calculate_pseudocount(pwm)
24
+ case @pseudocount
25
+ when Numeric
26
+ @pseudocount
27
+ when :default
28
+ # *0.95 is to guarantee that rounding errors won't exceed real max pseudocount and generate PCM with negative elements
29
+ max_pseudocount = max_pseudocount_fraction(pwm) * @count
30
+ (Math.log(@count) <= max_pseudocount*0.95) ? Math.log(@count) : max_pseudocount * 0.95
31
+ when Proc
32
+ @pseudocount.call(pwm)
33
+ else
34
+ raise Error, 'Unknown pseudocount type use numeric or :default or Proc with taking pcm parameter'
35
+ end
36
+ end
37
+
38
+ # \sum_{\alpha} p_{\alpha} \exp{s_{\alpha,j}}
39
+ def weighted_position_exponent(pos)
40
+ pos.each_with_index.map {|elem, letter_index| @background.frequencies[letter_index] * Math.exp(elem) }.inject(0.0, &:+)
41
+ end
42
+
43
+ # possible (pseudocount / count) range is from 0 to max_pseudocount_fraction
44
+ # it's derived from
45
+ # (-\exp{s_{\alpha,j}} + \sum_{\alpha'} p_{\alpha'}\exp{s_{\alpha',j}}) * pseudocount < count * \exp{s_{\alpha,j}}
46
+ # which is derived from the fact that each element of PCM should be not less than 0
47
+ def max_pseudocount_fraction(pwm)
48
+ # min = 0.0
49
+ max = Float::INFINITY
50
+ pwm.each_position do |pos|
51
+ pos.each_with_index do |elem, letter_index|
52
+ coeff = weighted_position_exponent(pos) - Math.exp(elem)
53
+ if coeff > 0
54
+ max = [Math.exp(elem) / coeff, max].min
55
+ # elsif coeff < 0
56
+ # min = [Math.exp(elem) / coeff, min].max # Math.exp(elem) / coeff is always < 0 hence minimal pseudocount is zero
57
+ end
58
+ end
59
+ end
60
+ max
61
+ end
62
+
63
+ private :max_pseudocount_fraction, :weighted_position_exponent
64
+
65
+ def convert(pwm)
66
+ raise Error, "Can convert only PWMs" unless MotifModel.acts_as_pwm?(pwm)
67
+ actual_pseudocount = calculate_pseudocount(pwm)
68
+ matrix = pwm.each_position.map do |pos|
69
+ beta = Math.log( weighted_position_exponent(pos) )
70
+ pwm_pos = pos.each_index.map do |index|
71
+ (@count + actual_pseudocount) * @background.frequencies[index] * Math.exp( pos[index] ) * Math.exp( -beta ) - actual_pseudocount * @background.frequencies[index]
72
+ end
73
+ pwm_pos
74
+ end
75
+
76
+ pcm = MotifModel::PCM.new(matrix)
77
+ if pwm.respond_to? :name
78
+ pcm.named(pwm.name)
79
+ else
80
+ pcm
81
+ end
82
+ end
83
+ end
84
+ end
85
+ end
@@ -1,11 +1,5 @@
1
- require_relative 'parsers'
2
-
1
+ require_relative 'data_models/named_model'
3
2
  require_relative 'data_models/pm'
4
3
  require_relative 'data_models/pcm'
5
4
  require_relative 'data_models/ppm'
6
5
  require_relative 'data_models/pwm'
7
-
8
- require_relative 'data_models/collection'
9
-
10
- #require_relative 'bioinform/data_models/iupac_word'
11
- #require_relative 'bioinform/data_models/iupac_wordset'