bioinform 0.1.17 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (145)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -3
  3. data/LICENSE +0 -1
  4. data/README.md +1 -1
  5. data/TODO.txt +23 -30
  6. data/bin/convert_motif +4 -0
  7. data/bin/pcm2pwm +1 -1
  8. data/bin/split_motifs +1 -1
  9. data/bioinform.gemspec +0 -2
  10. data/lib/bioinform.rb +54 -16
  11. data/lib/bioinform/alphabet.rb +85 -0
  12. data/lib/bioinform/background.rb +90 -0
  13. data/lib/bioinform/cli.rb +1 -2
  14. data/lib/bioinform/cli/convert_motif.rb +52 -17
  15. data/lib/bioinform/cli/pcm2pwm.rb +32 -26
  16. data/lib/bioinform/cli/split_motifs.rb +31 -30
  17. data/lib/bioinform/conversion_algorithms.rb +6 -0
  18. data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
  19. data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
  20. data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
  21. data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
  22. data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
  23. data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
  24. data/lib/bioinform/data_models.rb +1 -7
  25. data/lib/bioinform/data_models/named_model.rb +38 -0
  26. data/lib/bioinform/data_models/pcm.rb +18 -28
  27. data/lib/bioinform/data_models/pm.rb +73 -170
  28. data/lib/bioinform/data_models/ppm.rb +11 -24
  29. data/lib/bioinform/data_models/pwm.rb +30 -56
  30. data/lib/bioinform/errors.rb +17 -0
  31. data/lib/bioinform/formatters.rb +4 -2
  32. data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
  33. data/lib/bioinform/formatters/motif_formatter.rb +69 -0
  34. data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
  35. data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
  36. data/lib/bioinform/parsers.rb +1 -8
  37. data/lib/bioinform/parsers/matrix_parser.rb +44 -36
  38. data/lib/bioinform/parsers/motif_splitter.rb +45 -0
  39. data/lib/bioinform/support.rb +46 -14
  40. data/lib/bioinform/support/strip_doc.rb +1 -1
  41. data/lib/bioinform/version.rb +1 -1
  42. data/spec/alphabet_spec.rb +79 -0
  43. data/spec/background_spec.rb +57 -0
  44. data/spec/cli/cli_spec.rb +6 -6
  45. data/spec/cli/convert_motif_spec.rb +88 -88
  46. data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
  47. data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
  48. data/spec/cli/pcm2pwm_spec.rb +22 -23
  49. data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
  50. data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
  51. data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
  52. data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
  53. data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
  54. data/spec/cli/split_motifs_spec.rb +6 -21
  55. data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
  56. data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
  57. data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
  58. data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
  59. data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
  60. data/spec/data_models/named_model_spec.rb +41 -0
  61. data/spec/data_models/pcm_spec.rb +114 -45
  62. data/spec/data_models/pm_spec.rb +132 -333
  63. data/spec/data_models/ppm_spec.rb +47 -44
  64. data/spec/data_models/pwm_spec.rb +85 -77
  65. data/spec/fabricators/motif_formats_fabricator.rb +116 -116
  66. data/spec/formatters/consensus_formatter_spec.rb +26 -0
  67. data/spec/formatters/raw_formatter_spec.rb +169 -0
  68. data/spec/parsers/matrix_parser_spec.rb +216 -0
  69. data/spec/parsers/motif_splitter_spec.rb +87 -0
  70. data/spec/spec_helper.rb +2 -2
  71. data/spec/spec_helper_source.rb +25 -5
  72. data/spec/support_spec.rb +31 -0
  73. metadata +43 -124
  74. data/bin/merge_into_collection +0 -4
  75. data/lib/bioinform/cli/merge_into_collection.rb +0 -80
  76. data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
  77. data/lib/bioinform/data_models/collection.rb +0 -75
  78. data/lib/bioinform/data_models/motif.rb +0 -56
  79. data/lib/bioinform/formatters/raw_formatter.rb +0 -41
  80. data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
  81. data/lib/bioinform/parsers/parser.rb +0 -92
  82. data/lib/bioinform/parsers/splittable_parser.rb +0 -57
  83. data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
  84. data/lib/bioinform/parsers/string_parser.rb +0 -72
  85. data/lib/bioinform/parsers/trivial_parser.rb +0 -34
  86. data/lib/bioinform/parsers/yaml_parser.rb +0 -35
  87. data/lib/bioinform/support/advanced_scan.rb +0 -8
  88. data/lib/bioinform/support/array_product.rb +0 -6
  89. data/lib/bioinform/support/array_zip.rb +0 -6
  90. data/lib/bioinform/support/collect_hash.rb +0 -7
  91. data/lib/bioinform/support/deep_dup.rb +0 -5
  92. data/lib/bioinform/support/delete_many.rb +0 -14
  93. data/lib/bioinform/support/inverf.rb +0 -13
  94. data/lib/bioinform/support/multiline_squish.rb +0 -6
  95. data/lib/bioinform/support/parameters.rb +0 -28
  96. data/lib/bioinform/support/partial_sums.rb +0 -16
  97. data/lib/bioinform/support/same_by.rb +0 -12
  98. data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
  99. data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
  100. data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
  101. data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
  102. data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
  103. data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
  104. data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
  105. data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
  106. data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
  107. data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
  108. data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
  109. data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
  110. data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
  111. data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
  112. data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
  113. data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
  114. data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
  115. data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
  116. data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
  117. data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
  118. data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
  119. data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
  120. data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
  121. data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
  122. data/spec/cli/data/split_motifs/collection.yaml +0 -188
  123. data/spec/cli/merge_into_collection_spec.rb +0 -100
  124. data/spec/data_models/collection_spec.rb +0 -98
  125. data/spec/data_models/motif_spec.rb +0 -224
  126. data/spec/fabricators/collection_fabricator.rb +0 -8
  127. data/spec/fabricators/motif_fabricator.rb +0 -33
  128. data/spec/fabricators/pcm_fabricator.rb +0 -25
  129. data/spec/fabricators/pm_fabricator.rb +0 -52
  130. data/spec/fabricators/ppm_fabricator.rb +0 -14
  131. data/spec/fabricators/pwm_fabricator.rb +0 -16
  132. data/spec/parsers/parser_spec.rb +0 -152
  133. data/spec/parsers/string_fantom_parser_spec.rb +0 -70
  134. data/spec/parsers/string_parser_spec.rb +0 -77
  135. data/spec/parsers/trivial_parser_spec.rb +0 -64
  136. data/spec/parsers/yaml_parser_spec.rb +0 -50
  137. data/spec/support/advanced_scan_spec.rb +0 -32
  138. data/spec/support/array_product_spec.rb +0 -15
  139. data/spec/support/array_zip_spec.rb +0 -15
  140. data/spec/support/collect_hash_spec.rb +0 -15
  141. data/spec/support/delete_many_spec.rb +0 -44
  142. data/spec/support/inverf_spec.rb +0 -19
  143. data/spec/support/multiline_squish_spec.rb +0 -25
  144. data/spec/support/partial_sums_spec.rb +0 -30
  145. data/spec/support/same_by_spec.rb +0 -36
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 792006e928db4e7ce443f56f93b94e9f5cbc2e95
4
- data.tar.gz: f7f4f25e156071fee5f242cc50ecdcd80741b596
3
+ metadata.gz: 41bb8dd19247a6f1b8e7643e5fbf1d0e03b823de
4
+ data.tar.gz: 7dbd3f01dbea7fe1ed3125bef775cc72e5dccf8e
5
5
  SHA512:
6
- metadata.gz: 4d0c1b14cba03745ecdf19b14579533489619f22f584e3fda64a77b4d72ba0998930b8239ceb5212b0d09ca6aa88879abc29057354cd8b4650ae35dab5845a7e
7
- data.tar.gz: 8e0a3b680e4e5ef765697abd6c0c449dc303ea75c1d886aef4c01a4a34f38a67008ac9bf2dc4f173ba0f7a3cd769cd22424670689fa727946bcc15767a5689ed
6
+ metadata.gz: a3e5c829bf134e07c7a03a56de61017ca9f3c7237e1c6a18e49d0cf57cdf1a3054c7501541070e85a756ae12231a84c6d04a6f94384b5d7dd57f93ceaf335e11
7
+ data.tar.gz: 84d64628b85fd7b5e757637de51c074baeb98ebd8053c83c4af4b77bcf7e843fca10e545b472e7fda781cc9b3fba4ccb9b8bd8055e254b5f7a1f83d1575f672f
data/Gemfile CHANGED
@@ -6,11 +6,11 @@ gemspec
6
6
 
7
7
  group :development do
8
8
  # gem 'win32console'
9
- gem 'rspec', '>= 2.0'
10
- gem 'fabrication', '~> 2.5.0'
9
+ gem 'rspec', '~> 3.0'
10
+ # gem 'fabrication', '~> 2.5.0'
11
11
  gem 'rspec-given', '>= 2.0.0'
12
12
  gem 'spork', '>= 0.9.2'
13
- gem 'fakefs', '~> 0.4.2'
13
+ gem 'fakefs', '~> 0.4.2', :require => 'fakefs/safe'
14
14
  gem 'wdm', :require => false
15
15
  gem 'guard-rspec', '>=2.1.0'
16
16
  end
data/LICENSE CHANGED
@@ -1,5 +1,4 @@
1
1
  Copyright (c) 2012-2014 Ilya Vorontsov
2
- bioinform utilizes several methods from the `activesupport` gem.
3
2
 
4
3
  MIT License
5
4
 
data/README.md CHANGED
@@ -19,7 +19,7 @@ Or install it yourself as:
19
19
 
20
20
  ## Usage
21
21
 
22
- Usage is under construction. I don't recommend use this gem for a while: syntax is on the way to change to more simple and concise. But stay tuned
22
+ Usage is under construction. I don't recommend to use this gem for a while: syntax is on the way to change to more simple and concise. But stay tuned
23
23
 
24
24
  ### Command-line applications
25
25
  * pcm2pwm
data/TODO.txt CHANGED
@@ -1,38 +1,31 @@
1
- ! Make matrices immutable - it will allow more safe interface and better caching
2
-
3
- Collection contain Motif-s, each Motif can contain any of list: pcm,pwm,ppm.
4
- Name, background, tags and any other parameters should be removed from PM class to be placed in Motif
5
-
6
-
7
-
8
- ToDo:
9
- how to make PM#equal? and PM#hash so that using PMs in Sets wouldn't destroy comparability of Sets and two sets with the same PMs(but different objects) would be equal. (also using pm as a hash-key)
10
- Make specs and fix code in such a way that Parser.split_on_motifs and so on returned consistent result. E.g. Parser.parse! raised an error on multiple times invocation
11
-
12
- refactor CLI::SplitMotifs in place where it splits collection file and choose real data models or makes PM
13
-
14
- Make Collection convenient way to store both pwm and pcm for a single motif (may be both should be in parameters of motif?). Also make methods like sort! that can change collection inner structure without working with @collection-variable directly. For example collection.sort!{|a,b| a.length<=>b.length} (here sort yielded only motif, but now it yields both motif and infos - it's inconvenient)
15
-
16
- Make parser exception print out text where parsing was broken (processing line +- 2 nearest lines and command and line numbers)
17
- Prevent parser going into infinity loop
1
+ сделать работу с ValidationError
2
+ сделать ошибки тэгированными
3
+ обобщить модели фона на разные алфавиты
4
+ парсеры
5
+ подумать про большее число парсеров: transfac etc
6
+ должны ли парсеры быть в библиотеке или снаружи
7
+ не стоит ли парсеры утащить в MotifModel или еще куда-нибудь? У нас ведь еще будут парсеры сиквенсов итп
8
+ форматтеры
9
+ должны ли форматтеры быть в библиотеке или снаружи
10
+ конвертеры
11
+ конвертер мары должен быть вынесен отдельно
12
+ конвертеры хорошо быть отрефакторить (но не ясно как это сделать хорошо)
13
+ посмотреть, совместимы ли конвертеры с идеей разных алфавитов
14
+ включить модели сиквенсов (оптимизировать их: нуклекотид-число; не забыть про разные алфавиты)/снипов/алигнментов/геномных позиций-интервалов
15
+ скоринг IUPAC-сиквенсов сейчас делается при помощи IUPAC-алфавитных матриц (см. конвертер PWM2IupacPWM). Написать хелпы.
16
+ утащить CLI из пакета куда-нибудь
17
+ утащить из bioinform.rb get_pcm
18
+ починить convert_motif
19
+
20
+ PM#equal? и PM#hash
21
+
22
+ ? Make parser exception print out text where parsing was broken (processing line +- 2 nearest lines and command and line numbers)
18
23
 
19
24
  Create CLI-apps:
20
- -- to merge many files(or whole folder) to a Collection (in a way that makes able to give collection a name)
21
-
22
- Make Parsers to be switcheable in runtime so that one could parse string composed of two motifs in different formats.
25
+ ? -- to merge many files(or whole folder) to a Collection (in a way that makes able to give collection a name)
23
26
 
24
27
  Decide:
25
- -- Whether PPM should have `words_count`/`weight`?
26
- PPM format such that parser got both matrix and count (if PPM have `word_count`)
27
- -- can_parse?
28
28
  -- Whether to cache suffices: cache :best_suffix, obsolete: [:discrete!, :background!, ...]
29
- -- behaviour of PM#== for PMs with different tags
30
- -- should background be in PM by default?
31
- -- refactor PM.new #== and so on to make possible consistently introduce or remove a variable at a single line
32
- -- Make PCM#valid? and PPM#valid? more specific. This shouldn't destroy functionality to load arbitrary data as matrix, but only in force mode (I don't understand yet where should it be: in a constructor or where? And which validation-"severity" levels should be? Strong validation - size-only-validation - size-and-type-validation - no validation ??? or may be options: valid_strictness: 'strict', 'usual', 'strict_with_name' ??? It should be considered)
33
- -- PM#to_pcm and friends have unintuitive behavior. E.g. pm.to_pcm.to_pwm != pm.to_pwm First is matrix treated as pcm and then converted, while second is matrix treated as pwm from start
34
- -- Should parser be reloadable or not? May be delete #reset_scanner?
35
- -- Should Collection has infos for each motif if it already has parameters? (see also discussion above about Collection#sort! and so on)
36
29
 
37
30
  Specs
38
31
  -- PWM#probabilities, #score_variance, #gauss_estimation
data/bin/convert_motif ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/bioinform/cli/convert_motif'
4
+ Bioinform::CLI::ConvertMotif.main(ARGV)
data/bin/pcm2pwm CHANGED
@@ -1,4 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require_relative '../lib/bioinform/cli/pcm2pwm'
4
- Bioinform::CLI::PCM2PWM.main(ARGV)
4
+ Bioinform::CLI::PCM2PWM.main(ARGV)
data/bin/split_motifs CHANGED
@@ -1,4 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require_relative '../lib/bioinform/cli/split_motifs'
4
- Bioinform::CLI::SplitMotifs.main(ARGV)
4
+ Bioinform::CLI::SplitMotifs.main(ARGV)
data/bioinform.gemspec CHANGED
@@ -14,6 +14,4 @@ Gem::Specification.new do |gem|
14
14
  gem.name = "bioinform"
15
15
  gem.require_paths = ["lib"]
16
16
  gem.version = Bioinform::VERSION
17
-
18
- gem.add_dependency('docopt', '= 0.5.0')
19
17
  end
data/lib/bioinform.rb CHANGED
@@ -1,37 +1,75 @@
1
1
  require_relative 'bioinform/version'
2
2
  require_relative 'bioinform/support'
3
+ require_relative 'bioinform/errors'
3
4
  require_relative 'bioinform/parsers'
4
- require_relative 'bioinform/formatters'
5
5
  require_relative 'bioinform/data_models'
6
+ require_relative 'bioinform/conversion_algorithms'
7
+ require_relative 'bioinform/formatters'
6
8
  require_relative 'bioinform/cli'
7
9
 
10
+ require_relative 'bioinform/background'
11
+ require_relative 'bioinform/alphabet'
12
+
8
13
  module Bioinform
9
- class Error < StandardError
14
+ def self.get_model(data_model, matrix, name)
15
+ Bioinform::MotifModel.const_get(data_model).new(matrix).named(name)
16
+ end
17
+
18
+ def self.get_model_from_string(data_model, matrix_string)
19
+ motif_infos = MatrixParser.new.parse(matrix_string)
20
+ get_model(data_model, motif_infos.matrix, name)
10
21
  end
11
22
 
12
23
  def self.get_pwm(data_model, matrix, background, pseudocount, effective_count)
13
- pm = Bioinform.const_get(data_model).new(matrix)
14
- pm.set_parameters(background: background)
15
- if pseudocount && ! pseudocount.blank? && [:PCM,:PPM].include?(data_model.to_sym)
16
- pm.set_parameters(pseudocount: pseudocount)
24
+ input_model = get_model_from_string(data_model, matrix)
25
+ case input_model
26
+ when MotifModel::PPM
27
+ ppm2pcm_converter = ConversionAlgorithms::PPM2PCM.new(count: effective_count)
28
+ pcm2pwm_converter = ConversionAlgorithms::PCM2PWM.new(background: background, pseudocount: pseudocount)
29
+ pcm2pwm_converter.convert(ppm2pcm_converter.convert(input_model))
30
+ when MotifModel::PCM
31
+ pcm2pwm_converter = ConversionAlgorithms::PCM2PWM.new(background: background, pseudocount: pseudocount)
32
+ pcm2pwm_converter.convert(input_model)
33
+ when MotifModel::PWM
34
+ input_model
35
+ else
36
+ raise Error, "Unknown input `#{input_model}`"
17
37
  end
18
- if effective_count && [:PPM].include?(data_model.to_sym)
19
- pm.set_parameters(effective_count: effective_count)
20
- end
21
- pm.to_pwm
22
38
  rescue => e
23
- raise "PWM creation failed (#{e})"
39
+ raise Error, "PWM creation failed (#{e})"
24
40
  end
25
41
 
26
42
  def self.get_pcm(data_model, matrix, effective_count)
27
- pm = Bioinform.const_get(data_model).new(matrix)
28
- if effective_count && [:PPM].include?(data_model.to_sym)
29
- pm.set_parameters(effective_count: effective_count)
43
+ input_model = get_model_from_string(data_model, matrix)
44
+ case input_model
45
+ when MotifModel::PPM
46
+ ppm2pcm_converter = ConversionAlgorithms::PPM2PCM.new(count: effective_count)
47
+ ppm2pcm_converter.convert(input_model)
48
+ when MotifModel::PCM
49
+ input_model
50
+ when MotifModel::PWM
51
+ raise Error, 'Conversion PWM-->PCM not yet implemented'
52
+ else
53
+ raise Error, "Unknown input `#{input_model}`"
30
54
  end
31
- pm.to_pcm
55
+ rescue => e
56
+ raise Error, "PCM creation failed (#{e})"
32
57
  end
33
58
 
34
59
  def self.get_ppm(data_model, matrix)
35
- Bioinform.const_get(data_model).new(matrix).to_ppm
60
+ input_model = get_model_from_string(data_model, matrix)
61
+ case input_model
62
+ when MotifModel::PPM
63
+ input_model
64
+ when MotifModel::PCM
65
+ pcm2ppm_converter = ConversionAlgorithms::PCM2PPM.new
66
+ pcm2ppm_converter.convert(input_model)
67
+ when MotifModel::PWM
68
+ raise Error, 'Conversion PWM-->PPM not yet implemented'
69
+ else
70
+ raise Error, "Unknown input `#{input_model}`"
71
+ end
72
+ rescue => e
73
+ raise Error, "PPM creation failed (#{e})"
36
74
  end
37
75
  end
data/lib/bioinform/alphabet.rb ADDED
@@ -0,0 +1,85 @@
1
+ require_relative 'support'
2
+ require_relative 'errors'
3
+
4
+ module Bioinform
5
+ # alphabets for DNA/RNA (which do have complements)
6
+ class ComplementableAlphabet
7
+ attr_reader :alphabet, :complement_alphabet
8
+
9
+ # ComplementableAlphabet.new([:A,:C,:G,:T], [:T,:G,:C,:A])
10
+ def initialize(alphabet, complements)
11
+ @alphabet = alphabet.map{|letter| letter.upcase.to_sym }
12
+ @complement_alphabet = complements.map{|letter| letter.upcase.to_sym }
13
+
14
+ @complements_by_letters = Support.various_key_value_case_types( Hash[ @alphabet.zip(@complement_alphabet) ] )
15
+
16
+ @index_by_letter = Support.various_key_case_types(Support.element_indices(@alphabet))
17
+ raise Error, "Complement's complement should be original letter" unless valid?
18
+ end
19
+
20
+ def valid?
21
+ non_duplicated_letters = (@alphabet.size == @alphabet.uniq.size)
22
+ compatible_sizes = (@alphabet.size == @complement_alphabet.size)
23
+ invertable_complement = @alphabet.all?{|letter| complement_letter(complement_letter(letter)) == letter }
24
+ non_duplicated_letters && compatible_sizes && invertable_complement
25
+ end
26
+ private :valid?
27
+
28
+ def size
29
+ @alphabet.size
30
+ end
31
+
32
+ def each_letter(&block)
33
+ @alphabet.each(&block)
34
+ end
35
+
36
+ def each_letter_index(&block)
37
+ @alphabet.each_index(&block)
38
+ end
39
+
40
+ def letter_by_index(index)
41
+ @alphabet[index] || raise(Error, "Unknown letter-index #{index}")
42
+ end
43
+
44
+ def index_by_letter(letter)
45
+ @index_by_letter[letter] || raise(Error, "Unknown letter #{letter}")
46
+ end
47
+
48
+ def complement_letter(letter)
49
+ @complements_by_letters[letter] || raise(Error, "Unknown letter #{letter}")
50
+ end
51
+
52
+ def complement_index(index)
53
+ letter = @complement_alphabet[index] || raise(Error, "Unknown letter-index #{index}")
54
+ @index_by_letter[letter]
55
+ end
56
+
57
+ def ==(other)
58
+ @alphabet == other.alphabet && @complement_alphabet == other.complement_alphabet
59
+ end
60
+ end
61
+
62
+
63
+ module IUPAC
64
+ NucleotideIndicesByIUPACLetter = {
65
+ A: [0], C: [1], G: [2], T: [3],
66
+ M: [0, 1], R: [0, 2], W: [0, 3], S: [1, 2], Y: [1, 3], K: [2, 3],
67
+ V: [0, 1, 2], H: [0, 1, 3], D: [0, 2, 3], B: [1, 2, 3],
68
+ N: [0, 1, 2, 3]
69
+ }
70
+ IUPACLettersByNucleotideIndices = Bioinform::Support.with_key_permutations(NucleotideIndicesByIUPACLetter.invert)
71
+
72
+ def self.complement_iupac_letter(iupac_letter)
73
+ nucleotide_indices = NucleotideIndicesByIUPACLetter[iupac_letter]
74
+ complement_nucleotide_indices = nucleotide_indices.map{|nucleotide_index| 3 - nucleotide_index }
75
+ IUPACLettersByNucleotideIndices[complement_nucleotide_indices]
76
+ end
77
+ end
78
+
79
+ iupac_letters = [:A, :C, :G, :T, :M, :R, :W, :S, :Y, :K, :V, :H, :D, :B, :N]
80
+
81
+ NucleotideAlphabet = ComplementableAlphabet.new([:A,:C,:G,:T], [:T,:G,:C,:A])
82
+ NucleotideAlphabetWithN = ComplementableAlphabet.new([:A,:C,:G,:T,:N], [:T,:G,:C,:A,:N])
83
+ IUPACAlphabet = ComplementableAlphabet.new( iupac_letters,
84
+ iupac_letters.map{|letter| IUPAC.complement_iupac_letter(letter) } )
85
+ end
data/lib/bioinform/background.rb ADDED
@@ -0,0 +1,90 @@
1
+ require_relative 'errors'
2
+
3
+ # TODO: generalize for the case of different alphabet
4
+ module Bioinform
5
+ # it also tags Frequencies and WordwiseBackground classes so that .is_a?(Bioinform::Background) is true for them
6
+ module Background
7
+ def self.wordwise
8
+ Bioinform::Background::Wordwise
9
+ end
10
+ def self.uniform
11
+ Bioinform::Background::Uniform
12
+ end
13
+
14
+ def self.from_gc_content(gc_content)
15
+ p_at = (1.0 - gc_content) / 2.0;
16
+ p_cg = gc_content / 2.0;
17
+ Frequencies.new([p_at, p_cg, p_cg, p_at])
18
+ end
19
+
20
+ def self.from_string(str)
21
+ return wordwise if str.downcase == 'wordwise'
22
+ return uniform if str.downcase == 'uniform'
23
+ arr = str.strip.split(',').map(&:to_f)
24
+ arr == [1,1,1,1] ? wordwise : Bioinform::Frequencies.new(arr)
25
+ end
26
+ end
27
+
28
+ module FrequencyCalculations
29
+ # sum(values_i * p_i)
30
+ def mean(values)
31
+ 4.times.map{|i| values[i] * frequencies[i] }.inject(0.0, &:+)
32
+ end
33
+ # sum(values_i^2 * p_i)
34
+ def mean_square(values)
35
+ 4.times.map{|i| values[i] * values[i] * frequencies[i] }.inject(0.0, &:+)
36
+ end
37
+
38
+ def symmetric?
39
+ frequencies == frequencies.reverse
40
+ end
41
+ end
42
+
43
+ class Frequencies
44
+ include FrequencyCalculations
45
+ include Bioinform::Background
46
+ def initialize(frequencies)
47
+ @frequencies = frequencies
48
+ raise Error, 'Sum of Background frequencies should be equal to 1' unless (frequencies.inject(0.0, &:+) - 1.0).abs < 1e-4
49
+ end
50
+
51
+ attr_reader :frequencies
52
+ def counts; frequencies; end
53
+ def volume; 1; end
54
+ def wordwise?; false; end
55
+
56
+
57
+ def ==(other)
58
+ self.class == other.class && frequencies == other.frequencies
59
+ end
60
+
61
+ def to_s
62
+ counts.join(',')
63
+ end
64
+ end
65
+
66
+ class WordwiseBackground
67
+ UniformFrequencies = [0.25, 0.25, 0.25, 0.25]
68
+ WordwiseCounts = [1, 1, 1, 1]
69
+ include FrequencyCalculations
70
+ include Bioinform::Background
71
+
72
+ def frequencies; UniformFrequencies; end
73
+ def counts; WordwiseCounts; end
74
+ def volume; 4; end
75
+ def wordwise?; true; end
76
+
77
+ def ==(other)
78
+ self.class == other.class
79
+ end
80
+
81
+ def to_s
82
+ counts.join(',')
83
+ end
84
+ end
85
+
86
+ module Background
87
+ Uniform = Bioinform::Frequencies.new([0.25, 0.25, 0.25, 0.25])
88
+ Wordwise = Bioinform::WordwiseBackground.new
89
+ end
90
+ end
data/lib/bioinform/cli.rb CHANGED
@@ -25,6 +25,5 @@ module Bioinform
25
25
  end
26
26
  end
27
27
 
28
- require_relative 'cli/merge_into_collection'
29
28
  require_relative 'cli/pcm2pwm'
30
- require_relative 'cli/split_motifs'
29
+ require_relative 'cli/split_motifs'
data/lib/bioinform/cli/convert_motif.rb CHANGED
@@ -7,14 +7,14 @@ $logger = Logger.new('convert_motif.log')
7
7
  module Bioinform
8
8
  module CLI
9
9
  class ConvertMotif
10
-
10
+
11
11
  def arguments
12
12
  @arguments ||= []
13
13
  end
14
14
  def options
15
15
  @options ||= {}
16
16
  end
17
-
17
+
18
18
  def main(argv)
19
19
  parse!(argv, filename_format: './{name}.{ext}')
20
20
  motif_files = arguments
@@ -23,28 +23,60 @@ module Bioinform
23
23
  puts option_parser.help()
24
24
  return
25
25
  end
26
-
26
+
27
27
  output_motifs = []
28
28
  motifs = motif_files.map do |filename|
29
+ input = File.read(filename)
30
+ motif_info = MotifParser.new.parse(input)
29
31
  case options[:model_from]
30
32
  when 'pwm'
31
- PWM.new(File.read(filename))
33
+ MotifModel::PWM.new(motif_info[:matrix]).named(motif_info[:name])
32
34
  when 'pcm'
33
- PCM.new(File.read(filename))
35
+ MotifModel::PCM.new(motif_info[:matrix]).named(motif_info[:name])
34
36
  when 'ppm'
35
- PPM.new(File.read(filename))
37
+ MotifModel::PPM.new(motif_info[:matrix]).named(motif_info[:name])
38
+ else
39
+ raise "Unknown value of model-from parameter: `#{options[:model_from]}`"
36
40
  end
37
41
  end
38
-
42
+ pcm2pwm_converter = ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: Background::Uniform)
43
+ pcm2ppm_converter = ConversionAlgorithms::PCM2PPMConverter.new
44
+ ppm2pcm_converter = ConversionAlgorithms::PPM2PCMConverter.new(count: 100)
39
45
  motifs.each do |motif|
40
46
  begin
41
47
  case options[:model_to]
42
48
  when 'pwm'
43
- output_motifs << motif.to_pwm
49
+ if MotifModel.acts_as_pcm?(motif)
50
+ output_motifs << pcm2pwm_converter.convert(motif)
51
+ elsif MotifModel.acts_as_ppm?(motif)
52
+ output_motifs << pcm2pwm_converter.convert(ppm2pcm_converter.convert(motif))
53
+ elsif MotifModel.acts_as_pwm?(motif)
54
+ output_motifs << motif
55
+ else
56
+ raise "Can't be here"
57
+ end
44
58
  when 'pcm'
45
- output_motifs << motif.to_pcm
59
+ if MotifModel.acts_as_pcm?(motif)
60
+ output_motifs << motif
61
+ elsif MotifModel.acts_as_ppm?(motif)
62
+ output_motifs << ppm2pcm_converter.convert(motif)
63
+ elsif MotifModel.acts_as_pwm?(motif)
64
+ raise 'Not yet implemented'
65
+ else
66
+ raise "Can't be here"
67
+ end
46
68
  when 'ppm'
47
- output_motifs << motif.to_ppm
69
+ if MotifModel.acts_as_pcm?(motif)
70
+ output_motifs << pcm2ppm_converter.convert(motif)
71
+ elsif MotifModel.acts_as_ppm?(motif)
72
+ output_motifs << motif
73
+ elsif MotifModel.acts_as_pwm?(motif)
74
+ raise 'Not yet implemented'
75
+ else
76
+ raise "Can't be here"
77
+ end
78
+ else
79
+ raise "Unknown value of model-to parameter: `#{options[:model_to]}`"
48
80
  end
49
81
  rescue
50
82
  $stderr.puts "One can't convert from #{options[:model_from]} data-model to #{options[:model_to]} data-model"
@@ -52,8 +84,10 @@ module Bioinform
52
84
  end
53
85
  end
54
86
  puts output_motifs.join("\n\n")
55
- rescue
87
+ rescue => e
56
88
  $stderr.puts "Error! Conversion wasn't performed"
89
+ $stderr.puts e
90
+ $stderr.puts e.backtrace
57
91
  end
58
92
 
59
93
  def option_parser
@@ -62,13 +96,14 @@ module Bioinform
62
96
  Usage:
63
97
  convert_motif [options] <motif-files>...
64
98
  ls | convert_motif [options]
65
-
99
+
66
100
  convert_motif - tool for converting motifs from different input formats
67
101
  to different output formats.
68
102
  It can change both formatting style and motif models.
69
103
  Resulting model is sent to stdout (this can be overriden with --save option).
70
104
  BANNER
71
-
105
+
106
+ cli.version = ::Bioinform::VERSION
72
107
  cli.summary_indent = ''
73
108
  cli.banner = strip_doc(banner)
74
109
  cli.separator ""
@@ -97,12 +132,12 @@ module Bioinform
97
132
  option_parser.parse!(argv)
98
133
  @arguments = argv
99
134
  end
100
-
101
-
135
+
136
+
102
137
  def self.main(argv)
103
138
  self.new.main(argv)
104
139
  end
105
-
140
+
106
141
  end
107
142
  end
108
- end
143
+ end