bioinform 0.1.17 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -3
- data/LICENSE +0 -1
- data/README.md +1 -1
- data/TODO.txt +23 -30
- data/bin/convert_motif +4 -0
- data/bin/pcm2pwm +1 -1
- data/bin/split_motifs +1 -1
- data/bioinform.gemspec +0 -2
- data/lib/bioinform.rb +54 -16
- data/lib/bioinform/alphabet.rb +85 -0
- data/lib/bioinform/background.rb +90 -0
- data/lib/bioinform/cli.rb +1 -2
- data/lib/bioinform/cli/convert_motif.rb +52 -17
- data/lib/bioinform/cli/pcm2pwm.rb +32 -26
- data/lib/bioinform/cli/split_motifs.rb +31 -30
- data/lib/bioinform/conversion_algorithms.rb +6 -0
- data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
- data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
- data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
- data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
- data/lib/bioinform/data_models.rb +1 -7
- data/lib/bioinform/data_models/named_model.rb +38 -0
- data/lib/bioinform/data_models/pcm.rb +18 -28
- data/lib/bioinform/data_models/pm.rb +73 -170
- data/lib/bioinform/data_models/ppm.rb +11 -24
- data/lib/bioinform/data_models/pwm.rb +30 -56
- data/lib/bioinform/errors.rb +17 -0
- data/lib/bioinform/formatters.rb +4 -2
- data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
- data/lib/bioinform/formatters/motif_formatter.rb +69 -0
- data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
- data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
- data/lib/bioinform/parsers.rb +1 -8
- data/lib/bioinform/parsers/matrix_parser.rb +44 -36
- data/lib/bioinform/parsers/motif_splitter.rb +45 -0
- data/lib/bioinform/support.rb +46 -14
- data/lib/bioinform/support/strip_doc.rb +1 -1
- data/lib/bioinform/version.rb +1 -1
- data/spec/alphabet_spec.rb +79 -0
- data/spec/background_spec.rb +57 -0
- data/spec/cli/cli_spec.rb +6 -6
- data/spec/cli/convert_motif_spec.rb +88 -88
- data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
- data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
- data/spec/cli/pcm2pwm_spec.rb +22 -23
- data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
- data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
- data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
- data/spec/cli/split_motifs_spec.rb +6 -21
- data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
- data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
- data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
- data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
- data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
- data/spec/data_models/named_model_spec.rb +41 -0
- data/spec/data_models/pcm_spec.rb +114 -45
- data/spec/data_models/pm_spec.rb +132 -333
- data/spec/data_models/ppm_spec.rb +47 -44
- data/spec/data_models/pwm_spec.rb +85 -77
- data/spec/fabricators/motif_formats_fabricator.rb +116 -116
- data/spec/formatters/consensus_formatter_spec.rb +26 -0
- data/spec/formatters/raw_formatter_spec.rb +169 -0
- data/spec/parsers/matrix_parser_spec.rb +216 -0
- data/spec/parsers/motif_splitter_spec.rb +87 -0
- data/spec/spec_helper.rb +2 -2
- data/spec/spec_helper_source.rb +25 -5
- data/spec/support_spec.rb +31 -0
- metadata +43 -124
- data/bin/merge_into_collection +0 -4
- data/lib/bioinform/cli/merge_into_collection.rb +0 -80
- data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
- data/lib/bioinform/data_models/collection.rb +0 -75
- data/lib/bioinform/data_models/motif.rb +0 -56
- data/lib/bioinform/formatters/raw_formatter.rb +0 -41
- data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
- data/lib/bioinform/parsers/parser.rb +0 -92
- data/lib/bioinform/parsers/splittable_parser.rb +0 -57
- data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
- data/lib/bioinform/parsers/string_parser.rb +0 -72
- data/lib/bioinform/parsers/trivial_parser.rb +0 -34
- data/lib/bioinform/parsers/yaml_parser.rb +0 -35
- data/lib/bioinform/support/advanced_scan.rb +0 -8
- data/lib/bioinform/support/array_product.rb +0 -6
- data/lib/bioinform/support/array_zip.rb +0 -6
- data/lib/bioinform/support/collect_hash.rb +0 -7
- data/lib/bioinform/support/deep_dup.rb +0 -5
- data/lib/bioinform/support/delete_many.rb +0 -14
- data/lib/bioinform/support/inverf.rb +0 -13
- data/lib/bioinform/support/multiline_squish.rb +0 -6
- data/lib/bioinform/support/parameters.rb +0 -28
- data/lib/bioinform/support/partial_sums.rb +0 -16
- data/lib/bioinform/support/same_by.rb +0 -12
- data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
- data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
- data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
- data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
- data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
- data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
- data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
- data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
- data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
- data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
- data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
- data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
- data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
- data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
- data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
- data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
- data/spec/cli/data/split_motifs/collection.yaml +0 -188
- data/spec/cli/merge_into_collection_spec.rb +0 -100
- data/spec/data_models/collection_spec.rb +0 -98
- data/spec/data_models/motif_spec.rb +0 -224
- data/spec/fabricators/collection_fabricator.rb +0 -8
- data/spec/fabricators/motif_fabricator.rb +0 -33
- data/spec/fabricators/pcm_fabricator.rb +0 -25
- data/spec/fabricators/pm_fabricator.rb +0 -52
- data/spec/fabricators/ppm_fabricator.rb +0 -14
- data/spec/fabricators/pwm_fabricator.rb +0 -16
- data/spec/parsers/parser_spec.rb +0 -152
- data/spec/parsers/string_fantom_parser_spec.rb +0 -70
- data/spec/parsers/string_parser_spec.rb +0 -77
- data/spec/parsers/trivial_parser_spec.rb +0 -64
- data/spec/parsers/yaml_parser_spec.rb +0 -50
- data/spec/support/advanced_scan_spec.rb +0 -32
- data/spec/support/array_product_spec.rb +0 -15
- data/spec/support/array_zip_spec.rb +0 -15
- data/spec/support/collect_hash_spec.rb +0 -15
- data/spec/support/delete_many_spec.rb +0 -44
- data/spec/support/inverf_spec.rb +0 -19
- data/spec/support/multiline_squish_spec.rb +0 -25
- data/spec/support/partial_sums_spec.rb +0 -30
- data/spec/support/same_by_spec.rb +0 -36
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Bioinform
  # Base class of the errors raised by this library, so that callers can
  # rescue everything coming from Bioinform with a single clause.
  class Error < ::StandardError
  end

  # Error which carries a list of individual validation failures in addition
  # to the main message.
  class ValidationError < Error
    # Array of validation failure descriptions (empty when none were given).
    attr_reader :validation_errors

    # msg     -- main error message.
    # options -- hash; `:validation_errors` is a list of failure descriptions
    #            (a single value is wrapped into a one-element list).
    def initialize(msg, options = {})
      super(msg)
      # Array() makes `nil` and a lone string safe to `join` in #to_s.
      @validation_errors = Array(options.fetch(:validation_errors, []))
    end

    # Main message followed by the validation failures in parentheses.
    # When there are no failures only the main message is returned, so the
    # text doesn't end with a dangling " ()".
    def to_s
      return super if @validation_errors.empty?
      "#{super} (#{@validation_errors.join('; ')})"
    end
  end
end
|
data/lib/bioinform/formatters.rb
CHANGED
|
@@ -1,2 +1,4 @@
|
|
|
1
|
-
require_relative 'formatters/
|
|
2
|
-
require_relative 'formatters/
|
|
1
|
+
require_relative 'formatters/motif_formatter'
|
|
2
|
+
require_relative 'formatters/pretty_matrix_formatter'
|
|
3
|
+
require_relative 'formatters/transfac_formatter'
|
|
4
|
+
require_relative 'formatters/consensus_formatter'
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
require_relative '../alphabet'
|
|
2
|
+
|
|
3
|
+
module Bioinform
  # Builds a consensus string for a positional matrix: one letter per
  # position, looked up in Bioinform::IUPAC::IUPACLettersByNucleotideIndices
  # by the list of nucleotide indices which the supplied block has selected.
  class ConsensusFormatter
    # The block is called as `block.call(position, element, nucleotide_index)`
    # for each element of each position; a truthy result selects that
    # nucleotide. Example:
    #   ConsensusFormatter.new{|pos, el, nucleotide_index| el == pos.max }
    def initialize(&block)
      raise Error, 'block is necessary to create an instance of ConsensusFormatter' if block.nil?
      @block = block
    end

    # Simplest formatter: selects the nucleotides whose element equals the
    # maximum of the position.
    def self.by_maximal_elements
      new { |position, element, _nucleotide_index| element == position.max }
    end

    # Returns the consensus string for `pm` (anything whose #each_position
    # returns an enumerable of positions).
    def format_string(pm)
      letters = pm.each_position.map { |position| iupac_letter_by_position(position) }
      letters.join
    end

    private

    # Indices of the nucleotides of `pos` which the block has selected.
    def nucleotide_indices_by_position(pos)
      pos.each_index.select do |index|
        @block.call(pos, pos[index], index)
      end
    end

    # Letter registered in the IUPAC table for the selected indices.
    def iupac_letter_by_position(pos)
      selected_indices = nucleotide_indices_by_position(pos)
      Bioinform::IUPAC::IUPACLettersByNucleotideIndices[selected_indices]
    end
  end
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
module Bioinform
  # Formats a motif as tab-separated text: an optional `>name` line followed
  # by the matrix.
  #
  # Options (all optional):
  # * `:with_name` -- `true`, `false` or `:auto` (default). With `:auto` the
  #   name line is emitted only when the motif responds to #name and the name
  #   is not blank.
  # * `:nucleotides_in` -- `:columns` (default, one position per line) or
  #   `:rows` (one letter of the alphabet per line). Converted with #to_sym.
  # * `:precision` -- number of significant digits (`%.<precision>g`);
  #   `false` (default) prints elements with #to_s.
  # * `:with_nucleotide_header` -- emit letters of the motif alphabet.
  # * `:with_position_header` -- emit two-digit, 1-based position indices.
  #
  # Raises Error on invalid `:with_name` or `:nucleotides_in`.
  class MotifFormatter
    attr_reader :with_name, :nucleotides_in, :precision, :with_nucleotide_header, :with_position_header

    def initialize(options = {})
      @with_name = options.fetch(:with_name, :auto)
      @nucleotides_in = options.fetch(:nucleotides_in, :columns).to_sym
      @precision = options.fetch(:precision, false)
      @with_nucleotide_header = options.fetch(:with_nucleotide_header, false)
      @with_position_header = options.fetch(:with_position_header, false)
      raise Error, "`with_name` can be either `true` or `false` or `:auto` but was `#{@with_name}`" unless [true, false, :auto].include?(@with_name)
      raise Error, "`nucleotides_in` can be either `:rows` or `:columns` but was `#{@nucleotides_in}`" unless [:rows, :columns].include?(@nucleotides_in)
    end

    # Returns the name line (">name\n") or an empty string, depending on the
    # `with_name` option. Raises Error when the name is required but the motif
    # doesn't respond to #name.
    def format_name(motif)
      case @with_name
      when true
        raise Error, "Motif doesn't respond to #name" unless motif.respond_to?(:name)
        ">#{motif.name}\n"
      when false
        ""
      when :auto
        (motif.respond_to?(:name) && motif.name && !motif.name.strip.empty?) ? ">#{motif.name}\n" : ""
      end
    end

    # Returns the matrix part (without trailing newline) in the configured
    # orientation, with the requested headers.
    def format_matrix(motif)
      # When both headers are present the header line starts with an empty
      # cell, so that its cells are aligned with the cells below.
      corner = (with_nucleotide_header && with_position_header) ? "\t" : ""
      body = case @nucleotides_in
             when :columns then matrix_with_nucleotides_in_columns(motif)
             when :rows    then matrix_with_nucleotides_in_rows(motif)
             end
      "#{corner}#{body}"
    end

    # Returns the name line followed by the matrix.
    def format(motif)
      format_name(motif) + format_matrix(motif)
    end

    private

    # String representation of a single matrix element.
    def element_rounded(el)
      precision ? sprintf("%.#{precision}g", el) : el.to_s
    end

    # Two-digit, zero-padded position index.
    def position_index_formatted(pos)
      sprintf('%02d', pos)
    end

    # One line per position; optional first line with the alphabet letters.
    def matrix_with_nucleotides_in_columns(motif)
      header = with_nucleotide_header ? motif.alphabet.each_letter.to_a.join("\t") + "\n" : ""
      lines = motif.each_position.each_with_index.map do |pos, pos_index|
        prefix = with_position_header ? "#{position_index_formatted(pos_index + 1)}\t" : ""
        prefix + pos.map { |el| element_rounded(el) }.join("\t")
      end
      header + lines.join("\n")
    end

    # One line per letter; optional first line with the position indices.
    def matrix_with_nucleotides_in_rows(motif)
      header = with_position_header ? (1..motif.length).map { |pos| position_index_formatted(pos) }.join("\t") + "\n" : ""
      # Transpose once instead of once per letter.
      elements_by_letter = motif.matrix.transpose
      lines = motif.alphabet.each_letter.each_with_index.map do |letter, letter_index|
        prefix = with_nucleotide_header ? "#{letter}\t" : ""
        prefix + elements_by_letter[letter_index].map { |el| element_rounded(el) }.join("\t")
      end
      header + lines.join("\n")
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
require_relative 'motif_formatter'
|
|
2
|
+
|
|
3
|
+
module Bioinform
  # Formats a motif as a human-readable, space-aligned table.
  #
  # Options:
  # * `:with_name` (default `true`) -- put the motif name on the first line.
  # * `:letters_as_rows` (default `false`) -- when truthy, formatting is
  #   delegated to MotifFormatter with `nucleotides_in: :rows`.
  class PrettyMatrixFormatter
    attr_reader :with_name, :letters_as_rows

    def initialize(options = {})
      @with_name = options.fetch(:with_name, true)
      @letters_as_rows = options.fetch(:letters_as_rows, false)
    end

    # Returns the formatted motif.
    # Raises Error when a name is requested but the motif has no #name method.
    def format(motif)
      raise Error, "PM doesn't respond to #name. Use formatter with option `with_name: false`" if @with_name && !motif.respond_to?(:name)
      if @letters_as_rows
        return MotifFormatter.new(with_name: @with_name, nucleotides_in: :rows).format(motif)
      end
      optional_name(motif) + header + matrix_string(motif)
    end

    private

    # Header line with fixed A, C, G, T letters, each in a 7-character cell
    # which matches the width of the cells built in #matrix_string.
    def header
      %w{A C G T}.map { |el| el.rjust(4).ljust(7) }.join + "\n"
    end

    # Name line, or an empty string when the name is disabled or is nil.
    def optional_name(motif)
      (@with_name && motif.name) ? (motif.name + "\n") : ''
    end

    # One line per position; elements are rounded to 3 digits and
    # right-aligned in 6-character cells separated by a space.
    def matrix_string(motif)
      motif.each_position.map { |position|
        position.map { |el| el.round(3).to_s.rjust(6) }.join(' ')
      }.join("\n")
    end
  end
end
|
|
@@ -1,39 +1,31 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
default_options = {with_name: true, letters_as_rows: false}
|
|
8
|
-
@options = default_options.merge(options)
|
|
9
|
-
end
|
|
10
|
-
|
|
11
|
-
def name
|
|
12
|
-
motif.name
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
def header
|
|
16
|
-
if options[:with_name] && name
|
|
17
|
-
"ID #{name}\nBF StubSpeciesName\nP0\tA\tC\tG\tT\n"
|
|
18
|
-
else
|
|
19
|
-
raise 'Transfac should have the name field'
|
|
1
|
+
module Bioinform
  # Renders a named motif as a TRANSFAC-like text record: "ID"/"BF"/"P0"
  # header lines, one numbered line per position, then the "XX" and "//"
  # terminator lines.
  class TransfacFormatter
    attr_accessor :with_name

    # options -- hash; `:with_name` defaults to `true`.
    def initialize(options = {})
      @with_name = options.fetch(:with_name, true)
    end

    # Returns the whole record for `motif`. Raises (see #header) when the
    # name is disabled or the motif name is nil.
    def format(motif)
      [header(motif), matrix_string(motif), "\nXX\n//"].join
    end

    private

    # Header lines of the record; the record can't be built without a name.
    def header(motif)
      raise 'Transfac should have the name field' unless @with_name && motif.name
      "ID #{motif.name}\nBF StubSpeciesName\nP0\tA\tC\tG\tT\n"
    end

    # Matrix lines: zero-based position index padded to two digits, a space,
    # then tab-separated elements.
    def matrix_string(motif)
      rows = motif.each_position.each_with_index.map do |position, index|
        "#{index.to_s.rjust(2, '0')} #{position.join("\t")}"
      end
      rows.join("\n")
    end
  end
end
|
data/lib/bioinform/parsers.rb
CHANGED
|
@@ -1,9 +1,2 @@
|
|
|
1
|
-
require_relative 'parsers/parser'
|
|
2
|
-
require_relative 'parsers/trivial_parser'
|
|
3
|
-
require_relative 'parsers/yaml_parser'
|
|
4
|
-
require_relative 'parsers/string_parser'
|
|
5
|
-
require_relative 'parsers/string_fantom_parser'
|
|
6
|
-
require_relative 'parsers/splittable_parser'
|
|
7
|
-
require_relative 'parsers/jaspar_parser'
|
|
8
|
-
|
|
9
1
|
require_relative 'parsers/matrix_parser'
|
|
2
|
+
require_relative 'parsers/motif_splitter'
|
|
@@ -1,32 +1,63 @@
|
|
|
1
|
-
|
|
1
|
+
require_relative '../errors'
|
|
2
2
|
|
|
3
|
-
module Bioinform
|
|
3
|
+
module Bioinform
|
|
4
4
|
class MatrixParser
|
|
5
|
+
# fix_nucleotides_number -- raises if matrix has not enough nucleotide columns
|
|
6
|
+
attr_reader :has_name, :name_pattern, :has_header_row, :has_header_column, :nucleotides_in, :fix_nucleotides_number
|
|
5
7
|
# Options (all optional):
# * `:has_name` -- `:auto` (default), `false`, or any other value which
#   makes the name line mandatory (see #parse!).
# * `:name_pattern` -- regexp with a `name` named group; by default a name
#   can't start with a sign, a digit or a dot.
# * `:has_header_row`, `:has_header_column` -- whether the first row/column
#   of the matrix is a header to be dropped (default `false`).
# * `:nucleotides_in` -- `:rows`, `:columns` or `:auto` (default).
# * `:fix_nucleotides_number` -- required number of nucleotides per position
#   (default 4); a falsy value disables the check.
#
# Raises Error when `:nucleotides_in` is not one of the supported values.
def initialize(options = {})
  @has_name = options.fetch(:has_name, :auto)
  @name_pattern = options.fetch(:name_pattern, /^>?\s*(?<name>[^-+\d.\t\r\n][^\t\r\n]*).*$/)
  @has_header_row = options.fetch(:has_header_row, false)
  @has_header_column = options.fetch(:has_header_column, false)
  @nucleotides_in = options.fetch(:nucleotides_in, :auto)
  @fix_nucleotides_number = options.fetch(:fix_nucleotides_number, 4)

  # The message lists every accepted value, including the default `:auto`.
  raise Error, ':nucleotides_in option should be either :rows, :columns or :auto' unless [:rows, :columns, :auto].include?(@nucleotides_in)
end
|
|
14
17
|
|
|
18
|
+
# Heuristic for `nucleotides_in: :auto`: the matrix is treated as having
# nucleotides in rows (and thus needs transposition) when it has exactly
# `@fix_nucleotides_number` rows while its first row has a different length.
# A square matrix is left as is. The row length is compared with the
# configured nucleotides number, not with a hard-coded 4, so the check also
# works for other alphabet sizes.
def need_transpose?(matrix)
  (matrix.size == @fix_nucleotides_number) && (matrix.first.size != @fix_nucleotides_number)
end
|
|
21
|
+
private :need_transpose?
|
|
22
|
+
|
|
15
23
|
# Parses text with a single motif and returns `{matrix: ..., name: ...}`,
# where matrix is an array of positions (arrays of floats) and name is a
# string or nil.
#
# Steps: strip the input; extract the name from the first line (optional
# with `has_name: :auto`, skipped with `has_name: false`, mandatory
# otherwise); drop the header row/column if configured; convert elements
# with Float(); transpose according to `nucleotides_in`; when
# `fix_nucleotides_number` is set, require that many elements per position
# and drop the extra ones.
#
# Raises Error on any failure; errors of other classes are converted to
# Error with the same message.
def parse!(input)
  lines = input.strip.lines.to_a

  name = nil
  unless @has_name == false
    # An explicit check gives a meaningful message instead of a NoMethodError
    # text about nil leaking into the Error.
    raise Error, "Input is empty, motif name can't be extracted" if lines.empty?
    match = lines.first.match(@name_pattern)
    if match
      lines.shift
      name = match[:name]
    elsif @has_name != :auto
      raise Error, "Name pattern doesn't match"
    end
  end

  lines.shift if @has_header_row
  matrix = lines.map(&:rstrip).reject(&:empty?).map { |line| line.split }
  matrix = matrix.map { |row| row.drop(1) } if @has_header_column
  matrix = matrix.map { |row| row.map { |el| Float(el) } }

  transposition_needed = case @nucleotides_in
                         when :rows then true
                         when :auto then @fix_nucleotides_number && need_transpose?(matrix)
                         else false
                         end
  matrix = matrix.transpose if transposition_needed

  if @fix_nucleotides_number
    raise Error, 'Not enough nucleotides in a matrix' unless matrix.all? { |pos| pos.size >= @fix_nucleotides_number }
    matrix = matrix.map { |pos| pos.first(@fix_nucleotides_number) }
  end
  { matrix: matrix, name: name }
rescue Error
  # Already of the documented class: re-raise untouched.
  raise
rescue => e
  raise Error, e.message
end
|
|
31
62
|
|
|
32
63
|
def parse(input)
|
|
@@ -38,28 +69,5 @@ module Bioinform
|
|
|
38
69
|
rescue
|
|
39
70
|
false
|
|
40
71
|
end
|
|
41
|
-
|
|
42
|
-
class TemporaryWrapper
|
|
43
|
-
attr_reader :input
|
|
44
|
-
include Bioinform::Parser::ClassMethods
|
|
45
|
-
include Bioinform::Parser::SingleMotifParser::ClassMethods
|
|
46
|
-
def initialize(parser)
|
|
47
|
-
@parser, input = parser, input
|
|
48
|
-
end
|
|
49
|
-
def parse
|
|
50
|
-
@parser.parse(@input)
|
|
51
|
-
end
|
|
52
|
-
def parse!
|
|
53
|
-
@parser.parse!(@input)
|
|
54
|
-
end
|
|
55
|
-
def new(input)
|
|
56
|
-
@input = input
|
|
57
|
-
self
|
|
58
|
-
end
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
def wrapper
|
|
62
|
-
TemporaryWrapper.new(self)
|
|
63
|
-
end
|
|
64
72
|
end
|
|
65
73
|
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
module Bioinform
  #
  # MotifSplitter is designed to split text into chunks with separate motifs.
  # It enumerates input line by line.
  # One can supply two options:
  # * `splitter_pattern` which matches lines separating motifs
  # * `start_motif_pattern` which can determine start of motif but doesn't
  #   match within motif
  # If specified pattern is nil, corresponding splitting is not applied.
  # Patterns are applied by `#===` operator, thus both a regexp and a Proc
  # are valid options. Proc accepts a line and should return a truthy value
  # if the line is a splitter or is a motif start.
  #
  # Splitter method `#split` returns an array of strings. Each of returned
  # strings represents a motif. Motifs exclude splitter but include motif
  # start, thus one can divide input both by lines which will be dismissed
  # and by lines which will be retained.
  #
  class MotifSplitter
    attr_reader :start_motif_pattern, :splitter_pattern

    # Misspelled reader name which was exposed before; kept so that existing
    # callers don't break. Prefer #splitter_pattern.
    alias_method :spliiter, :splitter_pattern

    # options -- hash with optional `:start_motif_pattern` (by default a line
    # starting with `>` or with a token which can't start a number) and
    # `:splitter_pattern` (by default a blank line). Pass nil to disable.
    def initialize(options={})
      @start_motif_pattern = options.fetch(:start_motif_pattern, /^\s*([^-+\s\d.]+|>.*)/)
      @splitter_pattern = options.fetch(:splitter_pattern, /^\s*$/)
    end

    # Returns an array of stripped, non-empty strings, one per motif.
    def split(input)
      parts_divided_by_splitter(input).flat_map { |chunk|
        parts_divided_by_motif_starts(chunk)
      }.map(&:strip).reject(&:empty?)
    end

    private

    # Chunks of input between splitter lines (splitter lines are dismissed).
    # Always returns an array, also when splitting is disabled.
    def parts_divided_by_splitter(input)
      return [input] unless @splitter_pattern
      # Enumerable#chunk drops elements whose key is nil, so the result of
      # `===` is normalized to true/false: a Proc returning nil for ordinary
      # lines must not make them vanish.
      input.each_line.chunk { |line| (@splitter_pattern === line) ? true : false }
           .reject { |is_splitter, _lines| is_splitter }
           .map { |_is_splitter, lines| lines.join }
    end

    # Chunks of input each of which starts at a motif start line (the start
    # line is retained). Always returns an array.
    def parts_divided_by_motif_starts(input)
      return [input] unless @start_motif_pattern
      input.each_line.slice_before(@start_motif_pattern).map { |motif_lines| motif_lines.join }
    end
  end
end
|
data/lib/bioinform/support.rb
CHANGED
|
@@ -1,18 +1,50 @@
|
|
|
1
|
-
require_relative 'support/
|
|
2
|
-
require_relative 'support/third_part/active_support/core_ext/hash/indifferent_access'
|
|
1
|
+
require_relative 'support/strip_doc'
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
3
|
+
module Bioinform
  # Helpers which build lookup hashes. Every method returns a new hash and
  # doesn't modify its argument. Hashes are filled in place with
  # #each_with_object instead of creating a merged copy at each step.
  module Support
    # Maps each element to its index.
    # element_indices([:A,:C,:G,:T]) ==> {:A=>0, :C=>1, :G=>2, :T=>3}
    def self.element_indices(arr)
      arr.each_with_index.each_with_object({}) { |(letter, index), hsh| hsh[letter] = index }
    end

    # Maps every permutation of `key` to the same value.
    # hash_keys_permuted([0,1], :A) ==> {[0,1] => :A, [1,0] => :A}
    def self.hash_keys_permuted(key, value)
      key.permutation.each_with_object({}) { |perm, hsh| hsh[perm] = value }
    end

    # Extends the hash with all permutations of each key.
    # with_key_permutations({[0,1] => :A, [0,2] => :T}) ==> {[0,1] => :A, [1,0] => :A, [0,2] => :T, [2,0]=>:T}
    def self.with_key_permutations(hash)
      hash.each_with_object({}) { |(indices, letter), hsh| hsh.update(hash_keys_permuted(indices, letter)) }
    end

    # Adds lowercased and uppercased variants of each key (in this order).
    # various_key_cases({'a' => 2, 'C' => 3, :g =>5, :T => 8}) ==> {'a' => 2, 'A' => 2, 'c' => 3, 'C' => 3, :g =>5, :G => 5, :T => 8, :t=>8}
    def self.various_key_cases(hash)
      hash.each_with_object({}) do |(k, v), hsh|
        hsh[k.downcase] = v
        hsh[k.upcase] = v
      end
    end

    # Adds string and symbol variants of each key (in this order).
    # various_key_types({'a' => 2, 'C' => 3, :g =>5, :T => 8}) ==> {'a' => 2, :a => 2, 'C' => 3, :C => 3, :g =>5, 'g' => 5, :T => 8, 'T'=>8}
    def self.various_key_types(hash)
      hash.each_with_object({}) do |(k, v), hsh|
        hsh[k.to_s] = v
        hsh[k.to_sym] = v
      end
    end

    # Combination of #various_key_cases and #various_key_types.
    def self.various_key_case_types(hash)
      various_key_types(various_key_cases(hash))
    end

    # Adds uppercased and lowercased variants of each pair; key and value
    # always share the case.
    # various_key_value_cases({:A => :T}) ==> {:A => :T, :a => :t}
    def self.various_key_value_cases(hash)
      hash.each_with_object({}) do |(k, v), hsh|
        hsh[k.upcase] = v.upcase
        hsh[k.downcase] = v.downcase
      end
    end

    # Adds string and symbol variants of each pair; key and value always
    # share the type.
    # various_key_value_types({:A => :T}) ==> {:A => :T, 'A' => 'T'}
    def self.various_key_value_types(hash)
      hash.each_with_object({}) do |(k, v), hsh|
        hsh[k.to_s] = v.to_s
        hsh[k.to_sym] = v.to_sym
      end
    end

    # Combination of #various_key_value_cases and #various_key_value_types.
    def self.various_key_value_case_types(hash)
      various_key_value_types(various_key_value_cases(hash))
    end
  end
end
|