bioinform 0.1.17 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -3
- data/LICENSE +0 -1
- data/README.md +1 -1
- data/TODO.txt +23 -30
- data/bin/convert_motif +4 -0
- data/bin/pcm2pwm +1 -1
- data/bin/split_motifs +1 -1
- data/bioinform.gemspec +0 -2
- data/lib/bioinform.rb +54 -16
- data/lib/bioinform/alphabet.rb +85 -0
- data/lib/bioinform/background.rb +90 -0
- data/lib/bioinform/cli.rb +1 -2
- data/lib/bioinform/cli/convert_motif.rb +52 -17
- data/lib/bioinform/cli/pcm2pwm.rb +32 -26
- data/lib/bioinform/cli/split_motifs.rb +31 -30
- data/lib/bioinform/conversion_algorithms.rb +6 -0
- data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
- data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
- data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
- data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
- data/lib/bioinform/data_models.rb +1 -7
- data/lib/bioinform/data_models/named_model.rb +38 -0
- data/lib/bioinform/data_models/pcm.rb +18 -28
- data/lib/bioinform/data_models/pm.rb +73 -170
- data/lib/bioinform/data_models/ppm.rb +11 -24
- data/lib/bioinform/data_models/pwm.rb +30 -56
- data/lib/bioinform/errors.rb +17 -0
- data/lib/bioinform/formatters.rb +4 -2
- data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
- data/lib/bioinform/formatters/motif_formatter.rb +69 -0
- data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
- data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
- data/lib/bioinform/parsers.rb +1 -8
- data/lib/bioinform/parsers/matrix_parser.rb +44 -36
- data/lib/bioinform/parsers/motif_splitter.rb +45 -0
- data/lib/bioinform/support.rb +46 -14
- data/lib/bioinform/support/strip_doc.rb +1 -1
- data/lib/bioinform/version.rb +1 -1
- data/spec/alphabet_spec.rb +79 -0
- data/spec/background_spec.rb +57 -0
- data/spec/cli/cli_spec.rb +6 -6
- data/spec/cli/convert_motif_spec.rb +88 -88
- data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
- data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
- data/spec/cli/pcm2pwm_spec.rb +22 -23
- data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
- data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
- data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
- data/spec/cli/split_motifs_spec.rb +6 -21
- data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
- data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
- data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
- data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
- data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
- data/spec/data_models/named_model_spec.rb +41 -0
- data/spec/data_models/pcm_spec.rb +114 -45
- data/spec/data_models/pm_spec.rb +132 -333
- data/spec/data_models/ppm_spec.rb +47 -44
- data/spec/data_models/pwm_spec.rb +85 -77
- data/spec/fabricators/motif_formats_fabricator.rb +116 -116
- data/spec/formatters/consensus_formatter_spec.rb +26 -0
- data/spec/formatters/raw_formatter_spec.rb +169 -0
- data/spec/parsers/matrix_parser_spec.rb +216 -0
- data/spec/parsers/motif_splitter_spec.rb +87 -0
- data/spec/spec_helper.rb +2 -2
- data/spec/spec_helper_source.rb +25 -5
- data/spec/support_spec.rb +31 -0
- metadata +43 -124
- data/bin/merge_into_collection +0 -4
- data/lib/bioinform/cli/merge_into_collection.rb +0 -80
- data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
- data/lib/bioinform/data_models/collection.rb +0 -75
- data/lib/bioinform/data_models/motif.rb +0 -56
- data/lib/bioinform/formatters/raw_formatter.rb +0 -41
- data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
- data/lib/bioinform/parsers/parser.rb +0 -92
- data/lib/bioinform/parsers/splittable_parser.rb +0 -57
- data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
- data/lib/bioinform/parsers/string_parser.rb +0 -72
- data/lib/bioinform/parsers/trivial_parser.rb +0 -34
- data/lib/bioinform/parsers/yaml_parser.rb +0 -35
- data/lib/bioinform/support/advanced_scan.rb +0 -8
- data/lib/bioinform/support/array_product.rb +0 -6
- data/lib/bioinform/support/array_zip.rb +0 -6
- data/lib/bioinform/support/collect_hash.rb +0 -7
- data/lib/bioinform/support/deep_dup.rb +0 -5
- data/lib/bioinform/support/delete_many.rb +0 -14
- data/lib/bioinform/support/inverf.rb +0 -13
- data/lib/bioinform/support/multiline_squish.rb +0 -6
- data/lib/bioinform/support/parameters.rb +0 -28
- data/lib/bioinform/support/partial_sums.rb +0 -16
- data/lib/bioinform/support/same_by.rb +0 -12
- data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
- data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
- data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
- data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
- data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
- data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
- data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
- data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
- data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
- data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
- data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
- data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
- data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
- data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
- data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
- data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
- data/spec/cli/data/split_motifs/collection.yaml +0 -188
- data/spec/cli/merge_into_collection_spec.rb +0 -100
- data/spec/data_models/collection_spec.rb +0 -98
- data/spec/data_models/motif_spec.rb +0 -224
- data/spec/fabricators/collection_fabricator.rb +0 -8
- data/spec/fabricators/motif_fabricator.rb +0 -33
- data/spec/fabricators/pcm_fabricator.rb +0 -25
- data/spec/fabricators/pm_fabricator.rb +0 -52
- data/spec/fabricators/ppm_fabricator.rb +0 -14
- data/spec/fabricators/pwm_fabricator.rb +0 -16
- data/spec/parsers/parser_spec.rb +0 -152
- data/spec/parsers/string_fantom_parser_spec.rb +0 -70
- data/spec/parsers/string_parser_spec.rb +0 -77
- data/spec/parsers/trivial_parser_spec.rb +0 -64
- data/spec/parsers/yaml_parser_spec.rb +0 -50
- data/spec/support/advanced_scan_spec.rb +0 -32
- data/spec/support/array_product_spec.rb +0 -15
- data/spec/support/array_zip_spec.rb +0 -15
- data/spec/support/collect_hash_spec.rb +0 -15
- data/spec/support/delete_many_spec.rb +0 -44
- data/spec/support/inverf_spec.rb +0 -19
- data/spec/support/multiline_squish_spec.rb +0 -25
- data/spec/support/partial_sums_spec.rb +0 -30
- data/spec/support/same_by_spec.rb +0 -36
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 41bb8dd19247a6f1b8e7643e5fbf1d0e03b823de
|
4
|
+
data.tar.gz: 7dbd3f01dbea7fe1ed3125bef775cc72e5dccf8e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a3e5c829bf134e07c7a03a56de61017ca9f3c7237e1c6a18e49d0cf57cdf1a3054c7501541070e85a756ae12231a84c6d04a6f94384b5d7dd57f93ceaf335e11
|
7
|
+
data.tar.gz: 84d64628b85fd7b5e757637de51c074baeb98ebd8053c83c4af4b77bcf7e843fca10e545b472e7fda781cc9b3fba4ccb9b8bd8055e254b5f7a1f83d1575f672f
|
data/Gemfile
CHANGED
@@ -6,11 +6,11 @@ gemspec
|
|
6
6
|
|
7
7
|
group :development do
|
8
8
|
# gem 'win32console'
|
9
|
-
gem 'rspec', '
|
10
|
-
gem 'fabrication', '~> 2.5.0'
|
9
|
+
gem 'rspec', '~> 3.0'
|
10
|
+
# gem 'fabrication', '~> 2.5.0'
|
11
11
|
gem 'rspec-given', '>= 2.0.0'
|
12
12
|
gem 'spork', '>= 0.9.2'
|
13
|
-
gem 'fakefs', '~> 0.4.2'
|
13
|
+
gem 'fakefs', '~> 0.4.2', :require => 'fakefs/safe'
|
14
14
|
gem 'wdm', :require => false
|
15
15
|
gem 'guard-rspec', '>=2.1.0'
|
16
16
|
end
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -19,7 +19,7 @@ Or install it yourself as:
|
|
19
19
|
|
20
20
|
## Usage
|
21
21
|
|
22
|
-
Usage is under construction. I don't recommend use this gem for a while: syntax is on the way to change to more simple and concise. But stay tuned
|
22
|
+
Usage is under construction. I don't recommend using this gem for a while: the syntax is about to change to become simpler and more concise. But stay tuned
|
23
23
|
|
24
24
|
### Command-line applications
|
25
25
|
* pcm2pwm
|
data/TODO.txt
CHANGED
@@ -1,38 +1,31 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
1
|
+
сделать работу с ValidationError
|
2
|
+
сделать ошибки тэгированными
|
3
|
+
обобщить модели фона на разные алфавиты
|
4
|
+
парсеры
|
5
|
+
подумать про большее число парсеров: transfac etc
|
6
|
+
должны ли парсеры быть в библиотеке или снаружи
|
7
|
+
не стоит ли парсеры утащить в MotifModel или еще куда-нибудь? У нас ведь еще будут парсеры сиквенсов итп
|
8
|
+
форматтеры
|
9
|
+
должны ли форматтеры быть в библиотеке или снаружи
|
10
|
+
конвертеры
|
11
|
+
конвертер мары должен быть вынесен отдельно
|
12
|
+
конвертеры хорошо бы отрефакторить (но не ясно как это сделать хорошо)
|
13
|
+
посмотреть, совместимы ли конвертеры с идеей разных алфавитов
|
14
|
+
включить модели сиквенсов (оптимизировать их: нуклеотид-число; не забыть про разные алфавиты)/снипов/алигнментов/геномных позиций-интервалов
|
15
|
+
скоринг IUPAC-сиквенсов сейчас делается при помощи IUPAC-алфавитных матриц (см. конвертер PWM2IupacPWM). Написать хелпы.
|
16
|
+
утащить CLI из пакета куда-нибудь
|
17
|
+
утащить из bioinform.rb get_pcm
|
18
|
+
починить convert_motif
|
19
|
+
|
20
|
+
PM#equal? и PM#hash
|
21
|
+
|
22
|
+
? Make parser exception print out text where parsing was broken (processing line +- 2 nearest lines and command and line numbers)
|
18
23
|
|
19
24
|
Create CLI-apps:
|
20
|
-
-- to merge many files(or whole folder) to a Collection (in a way that makes able to give collection a name)
|
21
|
-
|
22
|
-
Make Parsers to be switcheable in runtime so that one could parse string composed of two motifs in different formats.
|
25
|
+
? -- to merge many files (or a whole folder) into a Collection (in a way that makes it possible to give the collection a name)
|
23
26
|
|
24
27
|
Decide:
|
25
|
-
-- Whether PPM should have `words_count`/`weight`?
|
26
|
-
PPM format such that parser got both matrix and count (if PPM have `word_count`)
|
27
|
-
-- can_parse?
|
28
28
|
-- Whether to cache suffices: cache :best_suffix, obsolete: [:discrete!, :background!, ...]
|
29
|
-
-- behaviour of PM#== for PMs with different tags
|
30
|
-
-- should background be in PM by default?
|
31
|
-
-- refactor PM.new #== and so on to make possible consistently introduce or remove a variable at a single line
|
32
|
-
-- Make PCM#valid? and PPM#valid? more specific. This shouldn't destroy functionality to load arbitrary data as matrix, but only in force mode (I don't understand yet where should it be: in a constructor or where? And which validation-"severity" levels should be? Strong validation - size-only-validation - size-and-type-validation - no validation ??? or may be options: valid_strictness: 'strict', 'usual', 'strict_with_name' ??? It should be considered)
|
33
|
-
-- PM#to_pcm and friends have unintuitive behavior. E.g. pm.to_pcm.to_pwm != pm.to_pwm First is matrix treated as pcm and then converted, while second is matrix treated as pwm from start
|
34
|
-
-- Should parser be reloadable or not? May be delete #reset_scanner?
|
35
|
-
-- Should Collection has infos for each motif if it already has parameters? (see also discussion above about Collection#sort! and so on)
|
36
29
|
|
37
30
|
Specs
|
38
31
|
-- PWM#probabilities, #score_variance, #gauss_estimation
|
data/bin/convert_motif
ADDED
data/bin/pcm2pwm
CHANGED
data/bin/split_motifs
CHANGED
data/bioinform.gemspec
CHANGED
data/lib/bioinform.rb
CHANGED
@@ -1,37 +1,75 @@
|
|
1
1
|
require_relative 'bioinform/version'
|
2
2
|
require_relative 'bioinform/support'
|
3
|
+
require_relative 'bioinform/errors'
|
3
4
|
require_relative 'bioinform/parsers'
|
4
|
-
require_relative 'bioinform/formatters'
|
5
5
|
require_relative 'bioinform/data_models'
|
6
|
+
require_relative 'bioinform/conversion_algorithms'
|
7
|
+
require_relative 'bioinform/formatters'
|
6
8
|
require_relative 'bioinform/cli'
|
7
9
|
|
10
|
+
require_relative 'bioinform/background'
|
11
|
+
require_relative 'bioinform/alphabet'
|
12
|
+
|
8
13
|
module Bioinform
|
9
|
-
|
14
|
+
# Builds a named motif model.
#
# data_model - name of a constant inside Bioinform::MotifModel (looked up via const_get).
# matrix     - matrix handed to the model constructor.
# name       - name attached to the model through #named.
#
# Returns whatever `#named` returns for the freshly built model.
def self.get_model(data_model, matrix, name)
  model_class = Bioinform::MotifModel.const_get(data_model)
  model_class.new(matrix).named(name)
end
|
17
|
+
|
18
|
+
# Parses a textual matrix and wraps it into a named motif model.
#
# data_model    - name of a constant inside Bioinform::MotifModel (see get_model).
# matrix_string - text handed to MatrixParser#parse.
#
# The motif name is taken from the parse result. The previous code passed a bare
# `name`, which inside this singleton method resolves to Module#name (the string
# "Bioinform"), so every model lost the name that had been parsed.
# NOTE(review): assumes the parse result responds to #name the same way it
# responds to #matrix - confirm against MatrixParser.
def self.get_model_from_string(data_model, matrix_string)
  motif_infos = MatrixParser.new.parse(matrix_string)
  get_model(data_model, motif_infos.matrix, motif_infos.name)
end
|
11
22
|
|
12
23
|
# Parses +matrix+ as a +data_model+ motif and returns it as a PWM.
#
# A PPM is first turned into a PCM (PPM2PCM with count: effective_count) and then
# into a PWM; a PCM is converted with PCM2PWM (background, pseudocount); a PWM is
# returned untouched. Any StandardError raised on the way (including the
# "Unknown input" one) is re-raised as Error with a "PWM creation failed" prefix.
def self.get_pwm(data_model, matrix, background, pseudocount, effective_count)
  parsed = get_model_from_string(data_model, matrix)
  if parsed.is_a?(MotifModel::PPM)
    to_pcm = ConversionAlgorithms::PPM2PCM.new(count: effective_count)
    to_pwm = ConversionAlgorithms::PCM2PWM.new(background: background, pseudocount: pseudocount)
    to_pwm.convert(to_pcm.convert(parsed))
  elsif parsed.is_a?(MotifModel::PCM)
    to_pwm = ConversionAlgorithms::PCM2PWM.new(background: background, pseudocount: pseudocount)
    to_pwm.convert(parsed)
  elsif parsed.is_a?(MotifModel::PWM)
    parsed
  else
    raise Error, "Unknown input `#{parsed}`"
  end
rescue => failure
  raise Error, "PWM creation failed (#{failure})"
end
|
25
41
|
|
26
42
|
# Parses +matrix+ as a +data_model+ motif and returns it as a PCM.
#
# A PPM is converted with PPM2PCM (count: effective_count); a PCM is returned
# untouched; PWM input is rejected because that conversion is not implemented.
# Any StandardError raised on the way is re-raised as Error with a
# "PCM creation failed" prefix.
def self.get_pcm(data_model, matrix, effective_count)
  parsed = get_model_from_string(data_model, matrix)
  if parsed.is_a?(MotifModel::PPM)
    to_pcm = ConversionAlgorithms::PPM2PCM.new(count: effective_count)
    to_pcm.convert(parsed)
  elsif parsed.is_a?(MotifModel::PCM)
    parsed
  elsif parsed.is_a?(MotifModel::PWM)
    raise Error, 'Conversion PWM-->PCM not yet implemented'
  else
    raise Error, "Unknown input `#{parsed}`"
  end
rescue => failure
  raise Error, "PCM creation failed (#{failure})"
end
|
33
58
|
|
34
59
|
# Parses +matrix+ as a +data_model+ motif and returns it as a PPM.
#
# A PPM is returned untouched; a PCM is converted with PCM2PPM; PWM input is
# rejected because that conversion is not implemented. Any StandardError raised
# on the way is re-raised as Error with a "PPM creation failed" prefix.
def self.get_ppm(data_model, matrix)
  parsed = get_model_from_string(data_model, matrix)
  if parsed.is_a?(MotifModel::PPM)
    parsed
  elsif parsed.is_a?(MotifModel::PCM)
    to_ppm = ConversionAlgorithms::PCM2PPM.new
    to_ppm.convert(parsed)
  elsif parsed.is_a?(MotifModel::PWM)
    raise Error, 'Conversion PWM-->PPM not yet implemented'
  else
    raise Error, "Unknown input `#{parsed}`"
  end
rescue => failure
  raise Error, "PPM creation failed (#{failure})"
end
|
37
75
|
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require_relative 'support'
|
2
|
+
require_relative 'errors'
|
3
|
+
|
4
|
+
module Bioinform
|
5
|
+
# Alphabets for DNA/RNA (which do have complements).
#
# Holds an ordered list of letters plus, position by position, the complement of
# each letter. Letters are normalized to upper-case symbols on construction.
class ComplementableAlphabet
  attr_reader :alphabet, :complement_alphabet

  # ComplementableAlphabet.new([:A,:C,:G,:T], [:T,:G,:C,:A])
  #
  # alphabet    - letters (anything responding to #upcase and #to_sym).
  # complements - complement of each letter, in the same order as +alphabet+.
  #
  # Raises Error when the alphabet is inconsistent (see #valid?).
  def initialize(alphabet, complements)
    @alphabet = alphabet.map { |letter| letter.upcase.to_sym }
    @complement_alphabet = complements.map { |letter| letter.upcase.to_sym }

    # NOTE(review): the Support helpers presumably register extra spellings
    # (case / string-vs-symbol) of each letter - confirm in Support.
    @complements_by_letters = Support.various_key_value_case_types(Hash[@alphabet.zip(@complement_alphabet)])
    @index_by_letter = Support.various_key_case_types(Support.element_indices(@alphabet))

    raise Error, "Complement's complement should be original letter" unless valid?
  end

  # True when letters are unique, both lists have equal length, and taking the
  # complement twice gives back the original letter.
  def valid?
    non_duplicated_letters = (@alphabet.size == @alphabet.uniq.size)
    compatible_sizes = (@alphabet.size == @complement_alphabet.size)
    invertable_complement = @alphabet.all? { |letter| complement_letter(complement_letter(letter)) == letter }
    non_duplicated_letters && compatible_sizes && invertable_complement
  end
  private :valid?

  # Number of letters in the alphabet.
  def size
    @alphabet.size
  end

  # Yields each letter in alphabet order.
  def each_letter(&block)
    @alphabet.each(&block)
  end

  # Yields each letter index in alphabet order.
  def each_letter_index(&block)
    @alphabet.each_index(&block)
  end

  # Letter stored at +index+; raises Error for an index with no letter.
  def letter_by_index(index)
    @alphabet[index] || raise(Error, "Unknown letter-index #{index}")
  end

  # Index of +letter+; raises Error for an unknown letter.
  def index_by_letter(letter)
    @index_by_letter[letter] || raise(Error, "Unknown letter #{letter}")
  end

  # Complement of +letter+; raises Error for an unknown letter.
  def complement_letter(letter)
    @complements_by_letters[letter] || raise(Error, "Unknown letter #{letter}")
  end

  # Index (in the alphabet) of the complement of the letter at +index+;
  # raises Error for an index with no letter.
  def complement_index(index)
    letter = @complement_alphabet[index] || raise(Error, "Unknown letter-index #{index}")
    @index_by_letter[letter]
  end

  # Two alphabets are equal when both letter lists match. Objects that do not
  # expose #alphabet / #complement_alphabet (nil, strings, ...) compare as
  # unequal instead of raising NoMethodError.
  def ==(other)
    other.respond_to?(:alphabet) && other.respond_to?(:complement_alphabet) &&
      @alphabet == other.alphabet && @complement_alphabet == other.complement_alphabet
  end
end
|
61
|
+
|
62
|
+
|
63
|
+
# IUPAC nucleotide codes expressed through indices of the A, C, G, T alphabet.
module IUPAC
  # IUPAC letter => indices of the plain nucleotides it stands for (A: 0, C: 1, G: 2, T: 3).
  NucleotideIndicesByIUPACLetter = {
    A: [0], C: [1], G: [2], T: [3],
    M: [0, 1], R: [0, 2], W: [0, 3], S: [1, 2], Y: [1, 3], K: [2, 3],
    V: [0, 1, 2], H: [0, 1, 3], D: [0, 2, 3], B: [1, 2, 3],
    N: [0, 1, 2, 3]
  }
  # Reverse lookup: nucleotide indices => IUPAC letter.
  # NOTE(review): with_key_permutations presumably registers every ordering of
  # each index list, which the complement lookup below relies on - confirm in Support.
  IUPACLettersByNucleotideIndices = Bioinform::Support.with_key_permutations(NucleotideIndicesByIUPACLetter.invert)

  # Returns the IUPAC letter covering the complements of all nucleotides that
  # +iupac_letter+ covers.
  #
  # Raises Error for a letter that is not a key of NucleotideIndicesByIUPACLetter
  # (previously this surfaced as a NoMethodError on nil).
  def self.complement_iupac_letter(iupac_letter)
    nucleotide_indices = NucleotideIndicesByIUPACLetter.fetch(iupac_letter) do
      raise Error, "Unknown IUPAC letter #{iupac_letter}"
    end
    # With the A, C, G, T ordering the complement of index i is 3 - i (A<->T, C<->G).
    complement_nucleotide_indices = nucleotide_indices.map { |nucleotide_index| 3 - nucleotide_index }
    IUPACLettersByNucleotideIndices[complement_nucleotide_indices]
  end
end
|
78
|
+
|
79
|
+
# Predefined alphabets: plain nucleotides, nucleotides plus N, and the full IUPAC code.
iupac_letters = [:A, :C, :G, :T, :M, :R, :W, :S, :Y, :K, :V, :H, :D, :B, :N]

NucleotideAlphabet = ComplementableAlphabet.new([:A, :C, :G, :T], [:T, :G, :C, :A])
NucleotideAlphabetWithN = ComplementableAlphabet.new([:A, :C, :G, :T, :N], [:T, :G, :C, :A, :N])

# Complement of every IUPAC letter, in the same order as iupac_letters.
iupac_complements = iupac_letters.map { |letter| IUPAC.complement_iupac_letter(letter) }
IUPACAlphabet = ComplementableAlphabet.new(iupac_letters, iupac_complements)
|
85
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
require_relative 'errors'
|
2
|
+
|
3
|
+
# TODO: generalize for the case of different alphabet
|
4
|
+
module Bioinform
|
5
|
+
# Factory methods for background models.
# It also tags Frequencies and WordwiseBackground classes so that
# .is_a?(Bioinform::Background) is true for them.
module Background
  # The shared wordwise background.
  def self.wordwise
    Bioinform::Background::Wordwise
  end

  # The shared uniform background.
  def self.uniform
    Bioinform::Background::Uniform
  end

  # Builds a Frequencies background from a GC fraction.
  #
  # gc_content - fraction of G+C, expected in 0..1. A and T each get
  #              (1 - gc_content) / 2; C and G each get gc_content / 2.
  #
  # Raises Error when gc_content is outside 0..1: such input used to produce
  # negative "frequencies" that still summed to 1 and slipped past validation.
  def self.from_gc_content(gc_content)
    unless (0.0..1.0).cover?(gc_content)
      raise Error, "GC-content should be in range 0..1, but was #{gc_content}"
    end
    p_at = (1.0 - gc_content) / 2.0
    p_cg = gc_content / 2.0
    Frequencies.new([p_at, p_cg, p_cg, p_at])
  end

  # Builds a background from its textual form.
  #
  # str - 'wordwise', 'uniform' (both case-insensitive), or comma-separated
  #       numbers; '1,1,1,1' means the wordwise background.
  #
  # Surrounding whitespace is ignored for every form (it used to be ignored for
  # the numeric form only, so " uniform\n" was not recognized).
  def self.from_string(str)
    normalized = str.strip
    keyword = normalized.downcase
    return wordwise if keyword == 'wordwise'
    return uniform if keyword == 'uniform'
    arr = normalized.split(',').map(&:to_f)
    arr == [1, 1, 1, 1] ? wordwise : Bioinform::Frequencies.new(arr)
  end
end
|
27
|
+
|
28
|
+
# Statistics shared by background classes. The including class must provide
# #frequencies (an array of per-letter probabilities). The number of letters is
# taken from that array instead of being fixed at 4.
module FrequencyCalculations
  # sum(values_i * p_i)
  def mean(values)
    frequencies.each_with_index.map { |frequency, i| values[i] * frequency }.inject(0.0, &:+)
  end

  # sum(values_i^2 * p_i)
  def mean_square(values)
    frequencies.each_with_index.map { |frequency, i| values[i] * values[i] * frequency }.inject(0.0, &:+)
  end

  # True when the frequency list reads the same in both directions.
  def symmetric?
    frequencies == frequencies.reverse
  end
end
|
42
|
+
|
43
|
+
# Background described by explicit per-letter frequencies.
class Frequencies
  include FrequencyCalculations
  include Bioinform::Background

  attr_reader :frequencies

  # frequencies - array of numbers whose sum must be 1 (tolerance 1e-4).
  #
  # Raises Error when the sum is off. The array is copied so that later changes
  # to the caller's array cannot alter an already validated background.
  def initialize(frequencies)
    total = frequencies.inject(0.0, &:+)
    raise Error, 'Sum of Background frequencies should be equal to 1' unless (total - 1.0).abs < 1e-4
    @frequencies = frequencies.dup
  end

  # For this class the counts are the frequencies themselves.
  def counts; frequencies; end
  def volume; 1; end
  def wordwise?; false; end

  # Equal when the other object has the same class and the same frequencies.
  def ==(other)
    self.class == other.class && frequencies == other.frequencies
  end

  # Comma-separated counts.
  def to_s
    counts.join(',')
  end
end
|
65
|
+
|
66
|
+
# Background with counts of 1 for each of the four letters and uniform frequencies.
class WordwiseBackground
  # Frozen: these arrays are returned as-is from #frequencies and #counts, so an
  # accidental in-place change by a caller would otherwise affect every user.
  UniformFrequencies = [0.25, 0.25, 0.25, 0.25].freeze
  WordwiseCounts = [1, 1, 1, 1].freeze
  include FrequencyCalculations
  include Bioinform::Background

  def frequencies; UniformFrequencies; end
  def counts; WordwiseCounts; end
  def volume; 4; end
  def wordwise?; true; end

  # Equal to any object of the same class.
  def ==(other)
    self.class == other.class
  end

  # Comma-separated counts.
  def to_s
    counts.join(',')
  end
end
|
85
|
+
|
86
|
+
# Shared background instances returned by Background.uniform / Background.wordwise.
module Background
  uniform_frequencies = [0.25, 0.25, 0.25, 0.25]
  Uniform = Bioinform::Frequencies.new(uniform_frequencies)
  Wordwise = Bioinform::WordwiseBackground.new
end
|
90
|
+
end
|
data/lib/bioinform/cli.rb
CHANGED
@@ -7,14 +7,14 @@ $logger = Logger.new('convert_motif.log')
|
|
7
7
|
module Bioinform
|
8
8
|
module CLI
|
9
9
|
class ConvertMotif
|
10
|
-
|
10
|
+
|
11
11
|
# Positional arguments; lazily initialized to an empty array.
def arguments
  return @arguments if @arguments
  @arguments = []
end
|
14
14
|
# Parsed options; lazily initialized to an empty hash.
def options
  return @options if @options
  @options = {}
end
|
17
|
-
|
17
|
+
|
18
18
|
def main(argv)
|
19
19
|
parse!(argv, filename_format: './{name}.{ext}')
|
20
20
|
motif_files = arguments
|
@@ -23,28 +23,60 @@ module Bioinform
|
|
23
23
|
puts option_parser.help()
|
24
24
|
return
|
25
25
|
end
|
26
|
-
|
26
|
+
|
27
27
|
output_motifs = []
|
28
28
|
motifs = motif_files.map do |filename|
|
29
|
+
input = File.read(filename)
|
30
|
+
motif_info = MotifParser.new.parse(input)
|
29
31
|
case options[:model_from]
|
30
32
|
when 'pwm'
|
31
|
-
PWM.new(
|
33
|
+
MotifModel::PWM.new(motif_info[:matrix]).named(motif_info[:name])
|
32
34
|
when 'pcm'
|
33
|
-
PCM.new(
|
35
|
+
MotifModel::PCM.new(motif_info[:matrix]).named(motif_info[:name])
|
34
36
|
when 'ppm'
|
35
|
-
PPM.new(
|
37
|
+
MotifModel::PPM.new(motif_info[:matrix]).named(motif_info[:name])
|
38
|
+
else
|
39
|
+
raise "Unknown value of model-from parameter: `#{options[:model_from]}`"
|
36
40
|
end
|
37
41
|
end
|
38
|
-
|
42
|
+
pcm2pwm_converter = ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: Background::Uniform)
|
43
|
+
pcm2ppm_converter = ConversionAlgorithms::PCM2PPMConverter.new
|
44
|
+
ppm2pcm_converter = ConversionAlgorithms::PPM2PCMConverter.new(count: 100)
|
39
45
|
motifs.each do |motif|
|
40
46
|
begin
|
41
47
|
case options[:model_to]
|
42
48
|
when 'pwm'
|
43
|
-
|
49
|
+
if MotifModel.acts_as_pcm?(motif)
|
50
|
+
output_motifs << pcm2pwm_converter.convert(motif)
|
51
|
+
elsif MotifModel.acts_as_ppm?(motif)
|
52
|
+
output_motifs << pcm2pwm_converter.convert(ppm2pcm_converter.convert(motif))
|
53
|
+
elsif MotifModel.acts_as_pwm?(motif)
|
54
|
+
output_motifs << motif
|
55
|
+
else
|
56
|
+
raise "Can't be here"
|
57
|
+
end
|
44
58
|
when 'pcm'
|
45
|
-
|
59
|
+
if MotifModel.acts_as_pcm?(motif)
|
60
|
+
output_motifs << motif
|
61
|
+
elsif MotifModel.acts_as_ppm?(motif)
|
62
|
+
output_motifs << ppm2pcm_converter.convert(motif)
|
63
|
+
elsif MotifModel.acts_as_pwm?(motif)
|
64
|
+
raise 'Not yet implemented'
|
65
|
+
else
|
66
|
+
raise "Can't be here"
|
67
|
+
end
|
46
68
|
when 'ppm'
|
47
|
-
|
69
|
+
if MotifModel.acts_as_pcm?(motif)
|
70
|
+
output_motifs << pcm2ppm_converter.convert(motif)
|
71
|
+
elsif MotifModel.acts_as_ppm?(motif)
|
72
|
+
output_motifs << motif
|
73
|
+
elsif MotifModel.acts_as_pwm?(motif)
|
74
|
+
raise 'Not yet implemented'
|
75
|
+
else
|
76
|
+
raise "Can't be here"
|
77
|
+
end
|
78
|
+
else
|
79
|
+
raise "Unknown value of model-to parameter: `#{options[:model_to]}`"
|
48
80
|
end
|
49
81
|
rescue
|
50
82
|
$stderr.puts "One can't convert from #{options[:model_from]} data-model to #{options[:model_to]} data-model"
|
@@ -52,8 +84,10 @@ module Bioinform
|
|
52
84
|
end
|
53
85
|
end
|
54
86
|
puts output_motifs.join("\n\n")
|
55
|
-
rescue
|
87
|
+
rescue => e
|
56
88
|
$stderr.puts "Error! Conversion wasn't performed"
|
89
|
+
$stderr.puts e
|
90
|
+
$stderr.puts e.backtrace
|
57
91
|
end
|
58
92
|
|
59
93
|
def option_parser
|
@@ -62,13 +96,14 @@ module Bioinform
|
|
62
96
|
Usage:
|
63
97
|
convert_motif [options] <motif-files>...
|
64
98
|
ls | convert_motif [options]
|
65
|
-
|
99
|
+
|
66
100
|
convert_motif - tool for converting motifs from different input formats
|
67
101
|
to different output formats.
|
68
102
|
It can change both formatting style and motif models.
|
69
103
|
Resulting model is sent to stdout (this can be overriden with --save option).
|
70
104
|
BANNER
|
71
|
-
|
105
|
+
|
106
|
+
cli.version = ::Bioinform::VERSION
|
72
107
|
cli.summary_indent = ''
|
73
108
|
cli.banner = strip_doc(banner)
|
74
109
|
cli.separator ""
|
@@ -97,12 +132,12 @@ module Bioinform
|
|
97
132
|
option_parser.parse!(argv)
|
98
133
|
@arguments = argv
|
99
134
|
end
|
100
|
-
|
101
|
-
|
135
|
+
|
136
|
+
|
102
137
|
# Class-level entry point: builds an instance and delegates to its #main.
def self.main(argv)
  runner = new
  runner.main(argv)
end
|
105
|
-
|
140
|
+
|
106
141
|
end
|
107
142
|
end
|
108
|
-
end
|
143
|
+
end
|