bioinform 0.1.17 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -3
- data/LICENSE +0 -1
- data/README.md +1 -1
- data/TODO.txt +23 -30
- data/bin/convert_motif +4 -0
- data/bin/pcm2pwm +1 -1
- data/bin/split_motifs +1 -1
- data/bioinform.gemspec +0 -2
- data/lib/bioinform.rb +54 -16
- data/lib/bioinform/alphabet.rb +85 -0
- data/lib/bioinform/background.rb +90 -0
- data/lib/bioinform/cli.rb +1 -2
- data/lib/bioinform/cli/convert_motif.rb +52 -17
- data/lib/bioinform/cli/pcm2pwm.rb +32 -26
- data/lib/bioinform/cli/split_motifs.rb +31 -30
- data/lib/bioinform/conversion_algorithms.rb +6 -0
- data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
- data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
- data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
- data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
- data/lib/bioinform/data_models.rb +1 -7
- data/lib/bioinform/data_models/named_model.rb +38 -0
- data/lib/bioinform/data_models/pcm.rb +18 -28
- data/lib/bioinform/data_models/pm.rb +73 -170
- data/lib/bioinform/data_models/ppm.rb +11 -24
- data/lib/bioinform/data_models/pwm.rb +30 -56
- data/lib/bioinform/errors.rb +17 -0
- data/lib/bioinform/formatters.rb +4 -2
- data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
- data/lib/bioinform/formatters/motif_formatter.rb +69 -0
- data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
- data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
- data/lib/bioinform/parsers.rb +1 -8
- data/lib/bioinform/parsers/matrix_parser.rb +44 -36
- data/lib/bioinform/parsers/motif_splitter.rb +45 -0
- data/lib/bioinform/support.rb +46 -14
- data/lib/bioinform/support/strip_doc.rb +1 -1
- data/lib/bioinform/version.rb +1 -1
- data/spec/alphabet_spec.rb +79 -0
- data/spec/background_spec.rb +57 -0
- data/spec/cli/cli_spec.rb +6 -6
- data/spec/cli/convert_motif_spec.rb +88 -88
- data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
- data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
- data/spec/cli/pcm2pwm_spec.rb +22 -23
- data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
- data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
- data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
- data/spec/cli/split_motifs_spec.rb +6 -21
- data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
- data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
- data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
- data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
- data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
- data/spec/data_models/named_model_spec.rb +41 -0
- data/spec/data_models/pcm_spec.rb +114 -45
- data/spec/data_models/pm_spec.rb +132 -333
- data/spec/data_models/ppm_spec.rb +47 -44
- data/spec/data_models/pwm_spec.rb +85 -77
- data/spec/fabricators/motif_formats_fabricator.rb +116 -116
- data/spec/formatters/consensus_formatter_spec.rb +26 -0
- data/spec/formatters/raw_formatter_spec.rb +169 -0
- data/spec/parsers/matrix_parser_spec.rb +216 -0
- data/spec/parsers/motif_splitter_spec.rb +87 -0
- data/spec/spec_helper.rb +2 -2
- data/spec/spec_helper_source.rb +25 -5
- data/spec/support_spec.rb +31 -0
- metadata +43 -124
- data/bin/merge_into_collection +0 -4
- data/lib/bioinform/cli/merge_into_collection.rb +0 -80
- data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
- data/lib/bioinform/data_models/collection.rb +0 -75
- data/lib/bioinform/data_models/motif.rb +0 -56
- data/lib/bioinform/formatters/raw_formatter.rb +0 -41
- data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
- data/lib/bioinform/parsers/parser.rb +0 -92
- data/lib/bioinform/parsers/splittable_parser.rb +0 -57
- data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
- data/lib/bioinform/parsers/string_parser.rb +0 -72
- data/lib/bioinform/parsers/trivial_parser.rb +0 -34
- data/lib/bioinform/parsers/yaml_parser.rb +0 -35
- data/lib/bioinform/support/advanced_scan.rb +0 -8
- data/lib/bioinform/support/array_product.rb +0 -6
- data/lib/bioinform/support/array_zip.rb +0 -6
- data/lib/bioinform/support/collect_hash.rb +0 -7
- data/lib/bioinform/support/deep_dup.rb +0 -5
- data/lib/bioinform/support/delete_many.rb +0 -14
- data/lib/bioinform/support/inverf.rb +0 -13
- data/lib/bioinform/support/multiline_squish.rb +0 -6
- data/lib/bioinform/support/parameters.rb +0 -28
- data/lib/bioinform/support/partial_sums.rb +0 -16
- data/lib/bioinform/support/same_by.rb +0 -12
- data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
- data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
- data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
- data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
- data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
- data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
- data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
- data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
- data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
- data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
- data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
- data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
- data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
- data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
- data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
- data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
- data/spec/cli/data/split_motifs/collection.yaml +0 -188
- data/spec/cli/merge_into_collection_spec.rb +0 -100
- data/spec/data_models/collection_spec.rb +0 -98
- data/spec/data_models/motif_spec.rb +0 -224
- data/spec/fabricators/collection_fabricator.rb +0 -8
- data/spec/fabricators/motif_fabricator.rb +0 -33
- data/spec/fabricators/pcm_fabricator.rb +0 -25
- data/spec/fabricators/pm_fabricator.rb +0 -52
- data/spec/fabricators/ppm_fabricator.rb +0 -14
- data/spec/fabricators/pwm_fabricator.rb +0 -16
- data/spec/parsers/parser_spec.rb +0 -152
- data/spec/parsers/string_fantom_parser_spec.rb +0 -70
- data/spec/parsers/string_parser_spec.rb +0 -77
- data/spec/parsers/trivial_parser_spec.rb +0 -64
- data/spec/parsers/yaml_parser_spec.rb +0 -50
- data/spec/support/advanced_scan_spec.rb +0 -32
- data/spec/support/array_product_spec.rb +0 -15
- data/spec/support/array_zip_spec.rb +0 -15
- data/spec/support/collect_hash_spec.rb +0 -15
- data/spec/support/delete_many_spec.rb +0 -44
- data/spec/support/inverf_spec.rb +0 -19
- data/spec/support/multiline_squish_spec.rb +0 -25
- data/spec/support/partial_sums_spec.rb +0 -30
- data/spec/support/same_by_spec.rb +0 -36
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 41bb8dd19247a6f1b8e7643e5fbf1d0e03b823de
|
|
4
|
+
data.tar.gz: 7dbd3f01dbea7fe1ed3125bef775cc72e5dccf8e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a3e5c829bf134e07c7a03a56de61017ca9f3c7237e1c6a18e49d0cf57cdf1a3054c7501541070e85a756ae12231a84c6d04a6f94384b5d7dd57f93ceaf335e11
|
|
7
|
+
data.tar.gz: 84d64628b85fd7b5e757637de51c074baeb98ebd8053c83c4af4b77bcf7e843fca10e545b472e7fda781cc9b3fba4ccb9b8bd8055e254b5f7a1f83d1575f672f
|
data/Gemfile
CHANGED
|
@@ -6,11 +6,11 @@ gemspec
|
|
|
6
6
|
|
|
7
7
|
group :development do
|
|
8
8
|
# gem 'win32console'
|
|
9
|
-
gem 'rspec', '
|
|
10
|
-
gem 'fabrication', '~> 2.5.0'
|
|
9
|
+
gem 'rspec', '~> 3.0'
|
|
10
|
+
# gem 'fabrication', '~> 2.5.0'
|
|
11
11
|
gem 'rspec-given', '>= 2.0.0'
|
|
12
12
|
gem 'spork', '>= 0.9.2'
|
|
13
|
-
gem 'fakefs', '~> 0.4.2'
|
|
13
|
+
gem 'fakefs', '~> 0.4.2', :require => 'fakefs/safe'
|
|
14
14
|
gem 'wdm', :require => false
|
|
15
15
|
gem 'guard-rspec', '>=2.1.0'
|
|
16
16
|
end
|
data/LICENSE
CHANGED
data/README.md
CHANGED
|
@@ -19,7 +19,7 @@ Or install it yourself as:
|
|
|
19
19
|
|
|
20
20
|
## Usage
|
|
21
21
|
|
|
22
|
-
Usage is under construction. I don't recommend use this gem for a while: syntax is on the way to change to more simple and concise. But stay tuned
|
|
22
|
+
Usage is under construction. I don't recommend using this gem for a while: the syntax is about to change to a simpler and more concise one. But stay tuned.
|
|
23
23
|
|
|
24
24
|
### Command-line applications
|
|
25
25
|
* pcm2pwm
|
data/TODO.txt
CHANGED
|
@@ -1,38 +1,31 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
1
|
+
сделать работу с ValidationError
|
|
2
|
+
сделать ошибки тэгированными
|
|
3
|
+
обобщить модели фона на разные алфавиты
|
|
4
|
+
парсеры
|
|
5
|
+
подумать про большее число парсеров: transfac etc
|
|
6
|
+
должны ли парсеры быть в библиотеке или снаружи
|
|
7
|
+
не стоит ли парсеры утащить в MotifModel или еще куда-нибудь? У нас ведь еще будут парсеры сиквенсов итп
|
|
8
|
+
форматтеры
|
|
9
|
+
должны ли форматтеры быть в библиотеке или снаружи
|
|
10
|
+
конвертеры
|
|
11
|
+
конвертер мары должен быть вынесен отдельно
|
|
12
|
+
конвертеры хорошо бы отрефакторить (но не ясно, как это сделать хорошо)
|
|
13
|
+
посмотреть, совместимы ли конвертеры с идеей разных алфавитов
|
|
14
|
+
включить модели сиквенсов (оптимизировать их: нуклеотид-число; не забыть про разные алфавиты)/снипов/алигнментов/геномных позиций-интервалов
|
|
15
|
+
скоринг IUPAC-сиквенсов сейчас делается при помощи IUPAC-алфавитных матриц (см. конвертер PWM2IupacPWM). Написать хелпы.
|
|
16
|
+
утащить CLI из пакета куда-нибудь
|
|
17
|
+
утащить из bioinform.rb get_pcm
|
|
18
|
+
починить convert_motif
|
|
19
|
+
|
|
20
|
+
PM#equal? и PM#hash
|
|
21
|
+
|
|
22
|
+
? Make parser exception print out text where parsing was broken (processing line +- 2 nearest lines and command and line numbers)
|
|
18
23
|
|
|
19
24
|
Create CLI-apps:
|
|
20
|
-
-- to merge many files(or whole folder) to a Collection (in a way that makes able to give collection a name)
|
|
21
|
-
|
|
22
|
-
Make Parsers to be switcheable in runtime so that one could parse string composed of two motifs in different formats.
|
|
25
|
+
? -- to merge many files (or a whole folder) into a Collection (in a way that makes it possible to give the collection a name)
|
|
23
26
|
|
|
24
27
|
Decide:
|
|
25
|
-
-- Whether PPM should have `words_count`/`weight`?
|
|
26
|
-
PPM format such that parser got both matrix and count (if PPM have `word_count`)
|
|
27
|
-
-- can_parse?
|
|
28
28
|
-- Whether to cache suffices: cache :best_suffix, obsolete: [:discrete!, :background!, ...]
|
|
29
|
-
-- behaviour of PM#== for PMs with different tags
|
|
30
|
-
-- should background be in PM by default?
|
|
31
|
-
-- refactor PM.new #== and so on to make possible consistently introduce or remove a variable at a single line
|
|
32
|
-
-- Make PCM#valid? and PPM#valid? more specific. This shouldn't destroy functionality to load arbitrary data as matrix, but only in force mode (I don't understand yet where should it be: in a constructor or where? And which validation-"severity" levels should be? Strong validation - size-only-validation - size-and-type-validation - no validation ??? or may be options: valid_strictness: 'strict', 'usual', 'strict_with_name' ??? It should be considered)
|
|
33
|
-
-- PM#to_pcm and friends have unintuitive behavior. E.g. pm.to_pcm.to_pwm != pm.to_pwm First is matrix treated as pcm and then converted, while second is matrix treated as pwm from start
|
|
34
|
-
-- Should parser be reloadable or not? May be delete #reset_scanner?
|
|
35
|
-
-- Should Collection has infos for each motif if it already has parameters? (see also discussion above about Collection#sort! and so on)
|
|
36
29
|
|
|
37
30
|
Specs
|
|
38
31
|
-- PWM#probabilities, #score_variance, #gauss_estimation
|
data/bin/convert_motif
ADDED
data/bin/pcm2pwm
CHANGED
data/bin/split_motifs
CHANGED
data/bioinform.gemspec
CHANGED
data/lib/bioinform.rb
CHANGED
|
@@ -1,37 +1,75 @@
|
|
|
1
1
|
require_relative 'bioinform/version'
|
|
2
2
|
require_relative 'bioinform/support'
|
|
3
|
+
require_relative 'bioinform/errors'
|
|
3
4
|
require_relative 'bioinform/parsers'
|
|
4
|
-
require_relative 'bioinform/formatters'
|
|
5
5
|
require_relative 'bioinform/data_models'
|
|
6
|
+
require_relative 'bioinform/conversion_algorithms'
|
|
7
|
+
require_relative 'bioinform/formatters'
|
|
6
8
|
require_relative 'bioinform/cli'
|
|
7
9
|
|
|
10
|
+
require_relative 'bioinform/background'
|
|
11
|
+
require_relative 'bioinform/alphabet'
|
|
12
|
+
|
|
8
13
|
module Bioinform
|
|
9
|
-
|
|
14
|
+
def self.get_model(data_model, matrix, name)
|
|
15
|
+
Bioinform::MotifModel.const_get(data_model).new(matrix).named(name)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def self.get_model_from_string(data_model, matrix_string)
|
|
19
|
+
motif_infos = MatrixParser.new.parse(matrix_string)
|
|
20
|
+
get_model(data_model, motif_infos.matrix, motif_infos.name) # pass the parsed motif's name; a bare `name` here resolves to Bioinform.name ("Bioinform")
|
|
10
21
|
end
|
|
11
22
|
|
|
12
23
|
def self.get_pwm(data_model, matrix, background, pseudocount, effective_count)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
24
|
+
input_model = get_model_from_string(data_model, matrix)
|
|
25
|
+
case input_model
|
|
26
|
+
when MotifModel::PPM
|
|
27
|
+
ppm2pcm_converter = ConversionAlgorithms::PPM2PCM.new(count: effective_count)
|
|
28
|
+
pcm2pwm_converter = ConversionAlgorithms::PCM2PWM.new(background: background, pseudocount: pseudocount)
|
|
29
|
+
pcm2pwm_converter.convert(ppm2pcm_converter.convert(input_model))
|
|
30
|
+
when MotifModel::PCM
|
|
31
|
+
pcm2pwm_converter = ConversionAlgorithms::PCM2PWM.new(background: background, pseudocount: pseudocount)
|
|
32
|
+
pcm2pwm_converter.convert(input_model)
|
|
33
|
+
when MotifModel::PWM
|
|
34
|
+
input_model
|
|
35
|
+
else
|
|
36
|
+
raise Error, "Unknown input `#{input_model}`"
|
|
17
37
|
end
|
|
18
|
-
if effective_count && [:PPM].include?(data_model.to_sym)
|
|
19
|
-
pm.set_parameters(effective_count: effective_count)
|
|
20
|
-
end
|
|
21
|
-
pm.to_pwm
|
|
22
38
|
rescue => e
|
|
23
|
-
raise "PWM creation failed (#{e})"
|
|
39
|
+
raise Error, "PWM creation failed (#{e})"
|
|
24
40
|
end
|
|
25
41
|
|
|
26
42
|
def self.get_pcm(data_model, matrix, effective_count)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
43
|
+
input_model = get_model_from_string(data_model, matrix)
|
|
44
|
+
case input_model
|
|
45
|
+
when MotifModel::PPM
|
|
46
|
+
ppm2pcm_converter = ConversionAlgorithms::PPM2PCM.new(count: effective_count)
|
|
47
|
+
ppm2pcm_converter.convert(input_model)
|
|
48
|
+
when MotifModel::PCM
|
|
49
|
+
input_model
|
|
50
|
+
when MotifModel::PWM
|
|
51
|
+
raise Error, 'Conversion PWM-->PCM not yet implemented'
|
|
52
|
+
else
|
|
53
|
+
raise Error, "Unknown input `#{input_model}`"
|
|
30
54
|
end
|
|
31
|
-
|
|
55
|
+
rescue => e
|
|
56
|
+
raise Error, "PCM creation failed (#{e})"
|
|
32
57
|
end
|
|
33
58
|
|
|
34
59
|
def self.get_ppm(data_model, matrix)
|
|
35
|
-
|
|
60
|
+
input_model = get_model_from_string(data_model, matrix)
|
|
61
|
+
case input_model
|
|
62
|
+
when MotifModel::PPM
|
|
63
|
+
input_model
|
|
64
|
+
when MotifModel::PCM
|
|
65
|
+
pcm2ppm_converter = ConversionAlgorithms::PCM2PPM.new
|
|
66
|
+
pcm2ppm_converter.convert(input_model)
|
|
67
|
+
when MotifModel::PWM
|
|
68
|
+
raise Error, 'Conversion PWM-->PPM not yet implemented'
|
|
69
|
+
else
|
|
70
|
+
raise Error, "Unknown input `#{input_model}`"
|
|
71
|
+
end
|
|
72
|
+
rescue => e
|
|
73
|
+
raise Error, "PPM creation failed (#{e})"
|
|
36
74
|
end
|
|
37
75
|
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
require_relative 'support'
|
|
2
|
+
require_relative 'errors'
|
|
3
|
+
|
|
4
|
+
module Bioinform
|
|
5
|
+
# alphabets for DNA/RNA (which do have complements)
|
|
6
|
+
class ComplementableAlphabet
|
|
7
|
+
attr_reader :alphabet, :complement_alphabet
|
|
8
|
+
|
|
9
|
+
# ComplementableAlphabet.new([:A,:C,:G,:T], [:T,:G,:C,:A])
|
|
10
|
+
def initialize(alphabet, complements)
|
|
11
|
+
@alphabet = alphabet.map{|letter| letter.upcase.to_sym }
|
|
12
|
+
@complement_alphabet = complements.map{|letter| letter.upcase.to_sym }
|
|
13
|
+
|
|
14
|
+
@complements_by_letters = Support.various_key_value_case_types( Hash[ @alphabet.zip(@complement_alphabet) ] )
|
|
15
|
+
|
|
16
|
+
@index_by_letter = Support.various_key_case_types(Support.element_indices(@alphabet))
|
|
17
|
+
raise Error, "Complement's complement should be original letter" unless valid?
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def valid?
|
|
21
|
+
non_duplicated_letters = (@alphabet.size == @alphabet.uniq.size)
|
|
22
|
+
compatible_sizes = (@alphabet.size == @complement_alphabet.size)
|
|
23
|
+
invertable_complement = @alphabet.all?{|letter| complement_letter(complement_letter(letter)) == letter }
|
|
24
|
+
non_duplicated_letters && compatible_sizes && invertable_complement
|
|
25
|
+
end
|
|
26
|
+
private :valid?
|
|
27
|
+
|
|
28
|
+
def size
|
|
29
|
+
@alphabet.size
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def each_letter(&block)
|
|
33
|
+
@alphabet.each(&block)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def each_letter_index(&block)
|
|
37
|
+
@alphabet.each_index(&block)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def letter_by_index(index)
|
|
41
|
+
@alphabet[index] || raise(Error, "Unknown letter-index #{index}")
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def index_by_letter(letter)
|
|
45
|
+
@index_by_letter[letter] || raise(Error, "Unknown letter #{letter}")
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def complement_letter(letter)
|
|
49
|
+
@complements_by_letters[letter] || raise(Error, "Unknown letter #{letter}")
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def complement_index(index)
|
|
53
|
+
letter = @complement_alphabet[index] || raise(Error, "Unknown letter-index #{index}")
|
|
54
|
+
@index_by_letter[letter]
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def ==(other)
|
|
58
|
+
@alphabet == other.alphabet && @complement_alphabet == other.complement_alphabet
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
module IUPAC
|
|
64
|
+
NucleotideIndicesByIUPACLetter = {
|
|
65
|
+
A: [0], C: [1], G: [2], T: [3],
|
|
66
|
+
M: [0, 1], R: [0, 2], W: [0, 3], S: [1, 2], Y: [1, 3], K: [2, 3],
|
|
67
|
+
V: [0, 1, 2], H: [0, 1, 3], D: [0, 2, 3], B: [1, 2, 3],
|
|
68
|
+
N: [0, 1, 2, 3]
|
|
69
|
+
}
|
|
70
|
+
IUPACLettersByNucleotideIndices = Bioinform::Support.with_key_permutations(NucleotideIndicesByIUPACLetter.invert)
|
|
71
|
+
|
|
72
|
+
def self.complement_iupac_letter(iupac_letter)
|
|
73
|
+
nucleotide_indices = NucleotideIndicesByIUPACLetter[iupac_letter]
|
|
74
|
+
complement_nucleotide_indices = nucleotide_indices.map{|nucleotide_index| 3 - nucleotide_index }
|
|
75
|
+
IUPACLettersByNucleotideIndices[complement_nucleotide_indices]
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
iupac_letters = [:A, :C, :G, :T, :M, :R, :W, :S, :Y, :K, :V, :H, :D, :B, :N]
|
|
80
|
+
|
|
81
|
+
NucleotideAlphabet = ComplementableAlphabet.new([:A,:C,:G,:T], [:T,:G,:C,:A])
|
|
82
|
+
NucleotideAlphabetWithN = ComplementableAlphabet.new([:A,:C,:G,:T,:N], [:T,:G,:C,:A,:N])
|
|
83
|
+
IUPACAlphabet = ComplementableAlphabet.new( iupac_letters,
|
|
84
|
+
iupac_letters.map{|letter| IUPAC.complement_iupac_letter(letter) } )
|
|
85
|
+
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
require_relative 'errors'
|
|
2
|
+
|
|
3
|
+
# TODO: generalize to support different alphabets
|
|
4
|
+
module Bioinform
|
|
5
|
+
# it also tags Frequencies and WordwiseBackground classes so that .is_a?(Bioinform::Background) is true for them
|
|
6
|
+
module Background
|
|
7
|
+
def self.wordwise
|
|
8
|
+
Bioinform::Background::Wordwise
|
|
9
|
+
end
|
|
10
|
+
def self.uniform
|
|
11
|
+
Bioinform::Background::Uniform
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def self.from_gc_content(gc_content)
|
|
15
|
+
p_at = (1.0 - gc_content) / 2.0;
|
|
16
|
+
p_cg = gc_content / 2.0;
|
|
17
|
+
Frequencies.new([p_at, p_cg, p_cg, p_at])
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def self.from_string(str)
|
|
21
|
+
return wordwise if str.downcase == 'wordwise'
|
|
22
|
+
return uniform if str.downcase == 'uniform'
|
|
23
|
+
arr = str.strip.split(',').map(&:to_f)
|
|
24
|
+
arr == [1,1,1,1] ? wordwise : Bioinform::Frequencies.new(arr)
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
module FrequencyCalculations
|
|
29
|
+
# sum(values_i * p_i)
|
|
30
|
+
def mean(values)
|
|
31
|
+
4.times.map{|i| values[i] * frequencies[i] }.inject(0.0, &:+)
|
|
32
|
+
end
|
|
33
|
+
# sum(values_i^2 * p_i)
|
|
34
|
+
def mean_square(values)
|
|
35
|
+
4.times.map{|i| values[i] * values[i] * frequencies[i] }.inject(0.0, &:+)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def symmetric?
|
|
39
|
+
frequencies == frequencies.reverse
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
class Frequencies
|
|
44
|
+
include FrequencyCalculations
|
|
45
|
+
include Bioinform::Background
|
|
46
|
+
def initialize(frequencies)
|
|
47
|
+
@frequencies = frequencies
|
|
48
|
+
raise Error, 'Sum of Background frequencies should be equal to 1' unless (frequencies.inject(0.0, &:+) - 1.0).abs < 1e-4
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
attr_reader :frequencies
|
|
52
|
+
def counts; frequencies; end
|
|
53
|
+
def volume; 1; end
|
|
54
|
+
def wordwise?; false; end
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def ==(other)
|
|
58
|
+
self.class == other.class && frequencies == other.frequencies
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def to_s
|
|
62
|
+
counts.join(',')
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
class WordwiseBackground
|
|
67
|
+
UniformFrequencies = [0.25, 0.25, 0.25, 0.25]
|
|
68
|
+
WordwiseCounts = [1, 1, 1, 1]
|
|
69
|
+
include FrequencyCalculations
|
|
70
|
+
include Bioinform::Background
|
|
71
|
+
|
|
72
|
+
def frequencies; UniformFrequencies; end
|
|
73
|
+
def counts; WordwiseCounts; end
|
|
74
|
+
def volume; 4; end
|
|
75
|
+
def wordwise?; true; end
|
|
76
|
+
|
|
77
|
+
def ==(other)
|
|
78
|
+
self.class == other.class
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def to_s
|
|
82
|
+
counts.join(',')
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
module Background
|
|
87
|
+
Uniform = Bioinform::Frequencies.new([0.25, 0.25, 0.25, 0.25])
|
|
88
|
+
Wordwise = Bioinform::WordwiseBackground.new
|
|
89
|
+
end
|
|
90
|
+
end
|
data/lib/bioinform/cli.rb
CHANGED
|
@@ -7,14 +7,14 @@ $logger = Logger.new('convert_motif.log')
|
|
|
7
7
|
module Bioinform
|
|
8
8
|
module CLI
|
|
9
9
|
class ConvertMotif
|
|
10
|
-
|
|
10
|
+
|
|
11
11
|
def arguments
|
|
12
12
|
@arguments ||= []
|
|
13
13
|
end
|
|
14
14
|
def options
|
|
15
15
|
@options ||= {}
|
|
16
16
|
end
|
|
17
|
-
|
|
17
|
+
|
|
18
18
|
def main(argv)
|
|
19
19
|
parse!(argv, filename_format: './{name}.{ext}')
|
|
20
20
|
motif_files = arguments
|
|
@@ -23,28 +23,60 @@ module Bioinform
|
|
|
23
23
|
puts option_parser.help()
|
|
24
24
|
return
|
|
25
25
|
end
|
|
26
|
-
|
|
26
|
+
|
|
27
27
|
output_motifs = []
|
|
28
28
|
motifs = motif_files.map do |filename|
|
|
29
|
+
input = File.read(filename)
|
|
30
|
+
motif_info = MotifParser.new.parse(input)
|
|
29
31
|
case options[:model_from]
|
|
30
32
|
when 'pwm'
|
|
31
|
-
PWM.new(
|
|
33
|
+
MotifModel::PWM.new(motif_info[:matrix]).named(motif_info[:name])
|
|
32
34
|
when 'pcm'
|
|
33
|
-
PCM.new(
|
|
35
|
+
MotifModel::PCM.new(motif_info[:matrix]).named(motif_info[:name])
|
|
34
36
|
when 'ppm'
|
|
35
|
-
PPM.new(
|
|
37
|
+
MotifModel::PPM.new(motif_info[:matrix]).named(motif_info[:name])
|
|
38
|
+
else
|
|
39
|
+
raise "Unknown value of model-from parameter: `#{options[:model_from]}`"
|
|
36
40
|
end
|
|
37
41
|
end
|
|
38
|
-
|
|
42
|
+
pcm2pwm_converter = ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: Background::Uniform)
|
|
43
|
+
pcm2ppm_converter = ConversionAlgorithms::PCM2PPMConverter.new
|
|
44
|
+
ppm2pcm_converter = ConversionAlgorithms::PPM2PCMConverter.new(count: 100)
|
|
39
45
|
motifs.each do |motif|
|
|
40
46
|
begin
|
|
41
47
|
case options[:model_to]
|
|
42
48
|
when 'pwm'
|
|
43
|
-
|
|
49
|
+
if MotifModel.acts_as_pcm?(motif)
|
|
50
|
+
output_motifs << pcm2pwm_converter.convert(motif)
|
|
51
|
+
elsif MotifModel.acts_as_ppm?(motif)
|
|
52
|
+
output_motifs << pcm2pwm_converter.convert(ppm2pcm_converter.convert(motif))
|
|
53
|
+
elsif MotifModel.acts_as_pwm?(motif)
|
|
54
|
+
output_motifs << motif
|
|
55
|
+
else
|
|
56
|
+
raise "Can't be here"
|
|
57
|
+
end
|
|
44
58
|
when 'pcm'
|
|
45
|
-
|
|
59
|
+
if MotifModel.acts_as_pcm?(motif)
|
|
60
|
+
output_motifs << motif
|
|
61
|
+
elsif MotifModel.acts_as_ppm?(motif)
|
|
62
|
+
output_motifs << ppm2pcm_converter.convert(motif)
|
|
63
|
+
elsif MotifModel.acts_as_pwm?(motif)
|
|
64
|
+
raise 'Not yet implemented'
|
|
65
|
+
else
|
|
66
|
+
raise "Can't be here"
|
|
67
|
+
end
|
|
46
68
|
when 'ppm'
|
|
47
|
-
|
|
69
|
+
if MotifModel.acts_as_pcm?(motif)
|
|
70
|
+
output_motifs << pcm2ppm_converter.convert(motif)
|
|
71
|
+
elsif MotifModel.acts_as_ppm?(motif)
|
|
72
|
+
output_motifs << motif
|
|
73
|
+
elsif MotifModel.acts_as_pwm?(motif)
|
|
74
|
+
raise 'Not yet implemented'
|
|
75
|
+
else
|
|
76
|
+
raise "Can't be here"
|
|
77
|
+
end
|
|
78
|
+
else
|
|
79
|
+
raise "Unknown value of model-to parameter: `#{options[:model_to]}`"
|
|
48
80
|
end
|
|
49
81
|
rescue
|
|
50
82
|
$stderr.puts "One can't convert from #{options[:model_from]} data-model to #{options[:model_to]} data-model"
|
|
@@ -52,8 +84,10 @@ module Bioinform
|
|
|
52
84
|
end
|
|
53
85
|
end
|
|
54
86
|
puts output_motifs.join("\n\n")
|
|
55
|
-
rescue
|
|
87
|
+
rescue => e
|
|
56
88
|
$stderr.puts "Error! Conversion wasn't performed"
|
|
89
|
+
$stderr.puts e
|
|
90
|
+
$stderr.puts e.backtrace
|
|
57
91
|
end
|
|
58
92
|
|
|
59
93
|
def option_parser
|
|
@@ -62,13 +96,14 @@ module Bioinform
|
|
|
62
96
|
Usage:
|
|
63
97
|
convert_motif [options] <motif-files>...
|
|
64
98
|
ls | convert_motif [options]
|
|
65
|
-
|
|
99
|
+
|
|
66
100
|
convert_motif - tool for converting motifs from different input formats
|
|
67
101
|
to different output formats.
|
|
68
102
|
It can change both formatting style and motif models.
|
|
69
103
|
Resulting model is sent to stdout (this can be overriden with --save option).
|
|
70
104
|
BANNER
|
|
71
|
-
|
|
105
|
+
|
|
106
|
+
cli.version = ::Bioinform::VERSION
|
|
72
107
|
cli.summary_indent = ''
|
|
73
108
|
cli.banner = strip_doc(banner)
|
|
74
109
|
cli.separator ""
|
|
@@ -97,12 +132,12 @@ module Bioinform
|
|
|
97
132
|
option_parser.parse!(argv)
|
|
98
133
|
@arguments = argv
|
|
99
134
|
end
|
|
100
|
-
|
|
101
|
-
|
|
135
|
+
|
|
136
|
+
|
|
102
137
|
def self.main(argv)
|
|
103
138
|
self.new.main(argv)
|
|
104
139
|
end
|
|
105
|
-
|
|
140
|
+
|
|
106
141
|
end
|
|
107
142
|
end
|
|
108
|
-
end
|
|
143
|
+
end
|