bioinform 0.1.17 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -3
  3. data/LICENSE +0 -1
  4. data/README.md +1 -1
  5. data/TODO.txt +23 -30
  6. data/bin/convert_motif +4 -0
  7. data/bin/pcm2pwm +1 -1
  8. data/bin/split_motifs +1 -1
  9. data/bioinform.gemspec +0 -2
  10. data/lib/bioinform.rb +54 -16
  11. data/lib/bioinform/alphabet.rb +85 -0
  12. data/lib/bioinform/background.rb +90 -0
  13. data/lib/bioinform/cli.rb +1 -2
  14. data/lib/bioinform/cli/convert_motif.rb +52 -17
  15. data/lib/bioinform/cli/pcm2pwm.rb +32 -26
  16. data/lib/bioinform/cli/split_motifs.rb +31 -30
  17. data/lib/bioinform/conversion_algorithms.rb +6 -0
  18. data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
  19. data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
  20. data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
  21. data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
  22. data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
  23. data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
  24. data/lib/bioinform/data_models.rb +1 -7
  25. data/lib/bioinform/data_models/named_model.rb +38 -0
  26. data/lib/bioinform/data_models/pcm.rb +18 -28
  27. data/lib/bioinform/data_models/pm.rb +73 -170
  28. data/lib/bioinform/data_models/ppm.rb +11 -24
  29. data/lib/bioinform/data_models/pwm.rb +30 -56
  30. data/lib/bioinform/errors.rb +17 -0
  31. data/lib/bioinform/formatters.rb +4 -2
  32. data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
  33. data/lib/bioinform/formatters/motif_formatter.rb +69 -0
  34. data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
  35. data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
  36. data/lib/bioinform/parsers.rb +1 -8
  37. data/lib/bioinform/parsers/matrix_parser.rb +44 -36
  38. data/lib/bioinform/parsers/motif_splitter.rb +45 -0
  39. data/lib/bioinform/support.rb +46 -14
  40. data/lib/bioinform/support/strip_doc.rb +1 -1
  41. data/lib/bioinform/version.rb +1 -1
  42. data/spec/alphabet_spec.rb +79 -0
  43. data/spec/background_spec.rb +57 -0
  44. data/spec/cli/cli_spec.rb +6 -6
  45. data/spec/cli/convert_motif_spec.rb +88 -88
  46. data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
  47. data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
  48. data/spec/cli/pcm2pwm_spec.rb +22 -23
  49. data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
  50. data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
  51. data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
  52. data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
  53. data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
  54. data/spec/cli/split_motifs_spec.rb +6 -21
  55. data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
  56. data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
  57. data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
  58. data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
  59. data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
  60. data/spec/data_models/named_model_spec.rb +41 -0
  61. data/spec/data_models/pcm_spec.rb +114 -45
  62. data/spec/data_models/pm_spec.rb +132 -333
  63. data/spec/data_models/ppm_spec.rb +47 -44
  64. data/spec/data_models/pwm_spec.rb +85 -77
  65. data/spec/fabricators/motif_formats_fabricator.rb +116 -116
  66. data/spec/formatters/consensus_formatter_spec.rb +26 -0
  67. data/spec/formatters/raw_formatter_spec.rb +169 -0
  68. data/spec/parsers/matrix_parser_spec.rb +216 -0
  69. data/spec/parsers/motif_splitter_spec.rb +87 -0
  70. data/spec/spec_helper.rb +2 -2
  71. data/spec/spec_helper_source.rb +25 -5
  72. data/spec/support_spec.rb +31 -0
  73. metadata +43 -124
  74. data/bin/merge_into_collection +0 -4
  75. data/lib/bioinform/cli/merge_into_collection.rb +0 -80
  76. data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
  77. data/lib/bioinform/data_models/collection.rb +0 -75
  78. data/lib/bioinform/data_models/motif.rb +0 -56
  79. data/lib/bioinform/formatters/raw_formatter.rb +0 -41
  80. data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
  81. data/lib/bioinform/parsers/parser.rb +0 -92
  82. data/lib/bioinform/parsers/splittable_parser.rb +0 -57
  83. data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
  84. data/lib/bioinform/parsers/string_parser.rb +0 -72
  85. data/lib/bioinform/parsers/trivial_parser.rb +0 -34
  86. data/lib/bioinform/parsers/yaml_parser.rb +0 -35
  87. data/lib/bioinform/support/advanced_scan.rb +0 -8
  88. data/lib/bioinform/support/array_product.rb +0 -6
  89. data/lib/bioinform/support/array_zip.rb +0 -6
  90. data/lib/bioinform/support/collect_hash.rb +0 -7
  91. data/lib/bioinform/support/deep_dup.rb +0 -5
  92. data/lib/bioinform/support/delete_many.rb +0 -14
  93. data/lib/bioinform/support/inverf.rb +0 -13
  94. data/lib/bioinform/support/multiline_squish.rb +0 -6
  95. data/lib/bioinform/support/parameters.rb +0 -28
  96. data/lib/bioinform/support/partial_sums.rb +0 -16
  97. data/lib/bioinform/support/same_by.rb +0 -12
  98. data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
  99. data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
  100. data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
  101. data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
  102. data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
  103. data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
  104. data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
  105. data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
  106. data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
  107. data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
  108. data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
  109. data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
  110. data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
  111. data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
  112. data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
  113. data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
  114. data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
  115. data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
  116. data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
  117. data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
  118. data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
  119. data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
  120. data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
  121. data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
  122. data/spec/cli/data/split_motifs/collection.yaml +0 -188
  123. data/spec/cli/merge_into_collection_spec.rb +0 -100
  124. data/spec/data_models/collection_spec.rb +0 -98
  125. data/spec/data_models/motif_spec.rb +0 -224
  126. data/spec/fabricators/collection_fabricator.rb +0 -8
  127. data/spec/fabricators/motif_fabricator.rb +0 -33
  128. data/spec/fabricators/pcm_fabricator.rb +0 -25
  129. data/spec/fabricators/pm_fabricator.rb +0 -52
  130. data/spec/fabricators/ppm_fabricator.rb +0 -14
  131. data/spec/fabricators/pwm_fabricator.rb +0 -16
  132. data/spec/parsers/parser_spec.rb +0 -152
  133. data/spec/parsers/string_fantom_parser_spec.rb +0 -70
  134. data/spec/parsers/string_parser_spec.rb +0 -77
  135. data/spec/parsers/trivial_parser_spec.rb +0 -64
  136. data/spec/parsers/yaml_parser_spec.rb +0 -50
  137. data/spec/support/advanced_scan_spec.rb +0 -32
  138. data/spec/support/array_product_spec.rb +0 -15
  139. data/spec/support/array_zip_spec.rb +0 -15
  140. data/spec/support/collect_hash_spec.rb +0 -15
  141. data/spec/support/delete_many_spec.rb +0 -44
  142. data/spec/support/inverf_spec.rb +0 -19
  143. data/spec/support/multiline_squish_spec.rb +0 -25
  144. data/spec/support/partial_sums_spec.rb +0 -30
  145. data/spec/support/same_by_spec.rb +0 -36
@@ -0,0 +1,17 @@
1
module Bioinform
  # Base class of the errors raised by this library.
  class Error < ::StandardError
  end

  # Error that carries a list of individual validation failures in addition
  # to its summary message.
  class ValidationError < Error
    # Validation failures passed to the constructor (empty array by default).
    attr_reader :validation_errors

    # msg     -- summary message; optional so that a bare
    #            `raise ValidationError` does not fail with ArgumentError.
    # options -- hash; `:validation_errors` holds the list of failures.
    def initialize(msg = nil, options = {})
      super(msg)
      # An explicit nil is treated the same as "no errors given".
      @validation_errors = options.fetch(:validation_errors, []) || []
    end

    # Summary message, followed by the failures joined with '; ' in
    # parentheses. The parenthesised part is omitted when there are no
    # failures, so the message never ends with a dangling " ()".
    def to_s
      return super if @validation_errors.empty?
      "#{super} (#{@validation_errors.join('; ')})"
    end
  end
end
@@ -1,2 +1,4 @@
1
- require_relative 'formatters/raw_formatter'
2
- require_relative 'formatters/transfac_formatter'
1
+ require_relative 'formatters/motif_formatter'
2
+ require_relative 'formatters/pretty_matrix_formatter'
3
+ require_relative 'formatters/transfac_formatter'
4
+ require_relative 'formatters/consensus_formatter'
@@ -0,0 +1,35 @@
1
+ require_relative '../alphabet'
2
+
3
module Bioinform
  # Builds a consensus string for a positional matrix. The block given to the
  # constructor decides, for every element of a position, whether the
  # corresponding nucleotide index is selected; the selected indices are then
  # looked up in Bioinform::IUPAC::IUPACLettersByNucleotideIndices.
  class ConsensusFormatter

    # Simplest consensus formatter which takes into account only maximal elements
    def self.by_maximal_elements
      new { |column, value, _nucleotide_index| value == column.max }
    end

    # ConsensusFormatter.new{|pos, el, nucleotide_index| el == pos.max }
    def initialize(&block)
      unless block_given?
        raise Error, 'block is necessary to create an instance of ConsensusFormatter'
      end
      @block = block
    end

    # Joins the letters obtained for every position of `pm` (which must
    # respond to #each_position) into one string.
    def format_string(pm)
      letters = pm.each_position.map do |column|
        iupac_letter_by_position(column)
      end
      letters.join
    end

    private

    # Indices of `pos` for which the selection block returns a truthy value.
    def nucleotide_indices_by_position(pos)
      selected = []
      pos.each_index do |index|
        selected << index if @block.call(pos, pos[index], index)
      end
      selected
    end

    # Lookup of the selected indices in the IUPAC table.
    def iupac_letter_by_position(pos)
      selected = nucleotide_indices_by_position(pos)
      Bioinform::IUPAC::IUPACLettersByNucleotideIndices[selected]
    end
  end
end
@@ -0,0 +1,69 @@
1
module Bioinform
  # Renders a motif as tab-separated text, optionally preceded by a ">name"
  # line. The motif must respond to #each_position (and to #alphabet,
  # #matrix and #length when the corresponding options are used).
  #
  # Options:
  # * with_name -- true (name is mandatory), false (never printed) or :auto
  #   (printed only when the motif has a non-blank name). Default is :auto.
  # * nucleotides_in -- :columns (one line per position) or :rows (one line
  #   per letter). Default is :columns.
  # * precision -- number of significant digits used with "%.<n>g", or false
  #   to print elements with #to_s. Default is false.
  # * with_nucleotide_header, with_position_header -- add a header made of
  #   alphabet letters / two-digit 1-based position indices. Default is false.
  class MotifFormatter
    attr_reader :with_name, :nucleotides_in, :precision, :with_nucleotide_header, :with_position_header

    # Raises Error when `with_name` or `nucleotides_in` has an unsupported value.
    def initialize(options = {})
      @with_name = options.fetch(:with_name, :auto)
      @nucleotides_in = options.fetch(:nucleotides_in, :columns).to_sym
      @precision = options.fetch(:precision, false)
      @with_nucleotide_header = options.fetch(:with_nucleotide_header, false)
      @with_position_header = options.fetch(:with_position_header, false)
      raise Error, "`with_name` can be either `true` or `false` or `:auto` but was `#{@with_name}`" unless [true, false, :auto].include?(@with_name)
      raise Error, "`nucleotides_in` can be either `:rows` or `:columns` but was `#{@nucleotides_in}`" unless [:rows, :columns].include?(@nucleotides_in)
    end

    # Returns the ">name\n" line, or an empty string when no name is printed.
    def format_name(motif)
      case @with_name
      when true
        raise Error, "Motif doesn't respond to #name" unless motif.respond_to?(:name)
        ">#{motif.name}\n"
      when false
        ""
      when :auto
        # #to_s lets non-String names (e.g. symbols) through instead of
        # failing on a missing #strip.
        (motif.respond_to?(:name) && motif.name && !motif.name.to_s.strip.empty?) ? ">#{motif.name}\n" : ""
      end
    end

    def element_rounded(el)
      precision ? sprintf("%.#{precision}g", el) : el.to_s
    end

    def position_index_formatted(pos)
      sprintf('%02d', pos)
    end

    private :element_rounded, :position_index_formatted

    # Returns the matrix part of the output, without a trailing newline.
    def format_matrix(motif)
      # Explicitly unfrozen buffer: appending stays valid even if string
      # literals are frozen.
      result = ''.dup
      result << "\t" if with_nucleotide_header && with_position_header

      case @nucleotides_in
      when :columns
        if with_nucleotide_header
          result << motif.alphabet.each_letter.to_a.join("\t") << "\n"
        end
        motif.each_position.with_index do |pos, pos_index|
          result << "\n" if pos_index != 0
          result << "#{position_index_formatted(pos_index + 1)}\t" if with_position_header
          result << pos.map{|el| element_rounded(el) }.join("\t")
        end
      when :rows
        if with_position_header
          result << (1..motif.length).map{|pos| position_index_formatted(pos) }.join("\t") << "\n"
        end
        # Transposed once here rather than once per letter.
        letter_rows = motif.matrix.transpose
        motif.alphabet.each_letter.with_index do |letter, letter_index|
          result << "\n" if letter_index != 0
          result << "#{letter}\t" if with_nucleotide_header
          result << letter_rows[letter_index].map{|el| element_rounded(el) }.join("\t")
        end
      end
      result
    end

    # Name line (if any) followed by the matrix.
    def format(motif)
      format_name(motif) + format_matrix(motif)
    end

  end
end
@@ -0,0 +1,36 @@
1
+ require_relative 'motif_formatter'
2
+
3
module Bioinform
  # Formats a motif as an aligned table: an optional name line, a fixed
  # "A C G T" header and one line per position with elements rounded to three
  # digits and right-justified to six characters. With `letters_as_rows`
  # the work is delegated to MotifFormatter with nucleotides in rows.
  class PrettyMatrixFormatter
    attr_reader :with_name, :letters_as_rows

    # Options: with_name (default true), letters_as_rows (default false).
    def initialize(options = {})
      @with_name = options.fetch(:with_name, true)
      @letters_as_rows = options.fetch(:letters_as_rows, false)
    end

    # Raises Error when a name is requested but the motif has no #name method.
    def format(motif)
      if @with_name && !motif.respond_to?(:name)
        raise Error, "PM doesn't respond to #name. Use formatter with option `with_name: false`"
      end

      if @letters_as_rows
        delegate = MotifFormatter.new(with_name: @with_name, nucleotides_in: :rows)
        return delegate.format(motif)
      end

      [optional_name(motif), header, matrix_string(motif)].join
    end

    private

    # Header line: each letter right-justified to 4 and then padded to 7 characters.
    def header
      cells = %w{A C G T}.map { |letter| letter.rjust(4).ljust(7) }
      "#{cells.join}\n"
    end

    # Name line, or an empty string when names are disabled or the name is nil.
    def optional_name(motif)
      return '' unless @with_name && motif.name
      motif.name + "\n"
    end

    # One line per position; lines are joined without a trailing newline.
    def matrix_string(motif)
      rendered = motif.each_position.map do |position|
        cells = position.map { |el| el.round(3).to_s.rjust(6) }
        cells.join(' ')
      end
      rendered.join("\n")
    end
  end
end
@@ -1,39 +1,31 @@
1
- class TransfacFormatter
2
- attr_accessor :motif, :options
3
-
4
- def initialize(motif, options = {})
5
- @motif = motif
6
-
7
- default_options = {with_name: true, letters_as_rows: false}
8
- @options = default_options.merge(options)
9
- end
10
-
11
- def name
12
- motif.name
13
- end
14
-
15
- def header
16
- if options[:with_name] && name
17
- "ID #{name}\nBF StubSpeciesName\nP0\tA\tC\tG\tT\n"
18
- else
19
- raise 'Transfac should have the name field'
1
+ module Bioinform
2
+ class TransfacFormatter
3
+ attr_accessor :with_name
4
+
5
+ def initialize(options = {})
6
+ @with_name = options.fetch(:with_name, true)
20
7
  end
8
+
9
# Builds the ID / BF / P0 header lines for the motif.
# Raises when names are disabled (@with_name is falsy) or the motif's name is nil.
def header(motif)
  raise 'Transfac should have the name field' unless @with_name && motif.name
  "ID #{motif.name}\nBF StubSpeciesName\nP0\tA\tC\tG\tT\n"
end
16
+
17
# One line per position: the zero-based position index padded to two digits,
# a space, then the position's elements joined by tabs. Lines are joined by
# newlines, with no trailing newline.
def matrix_string(motif)
  rendered = motif.each_position.each_with_index.map do |pos, ind|
    # rjust is used because Kernel#format is shadowed by this class's #format.
    "#{ind.to_s.rjust(2, '0')} #{pos.join("\t")}"
  end
  rendered.join("\n")
end
24
+
25
# Full record: header, matrix lines, then the "XX" and "//" terminator lines
# (the result has no trailing newline).
def format(motif)
  [header(motif), matrix_string(motif), "\nXX\n//"].join
end
28
+
29
+ private :header, :matrix_string
21
30
  end
22
-
23
- def matrix_string
24
- motif.each_position.map.with_index{|pos,ind|
25
- line_number = ind.to_s
26
- line_number = (line_number.size == 1) ? "0#{line_number}" : line_number
27
- line_number + ' ' + pos.join("\t")
28
- }.join("\n")
29
- end
30
-
31
- def footer
32
- #"XX\n//\n"
33
- "\nXX\n//"
34
- end
35
-
36
- def to_s
37
- header + matrix_string + footer
38
- end
39
- end
31
+ end
@@ -1,9 +1,2 @@
1
- require_relative 'parsers/parser'
2
- require_relative 'parsers/trivial_parser'
3
- require_relative 'parsers/yaml_parser'
4
- require_relative 'parsers/string_parser'
5
- require_relative 'parsers/string_fantom_parser'
6
- require_relative 'parsers/splittable_parser'
7
- require_relative 'parsers/jaspar_parser'
8
-
9
1
  require_relative 'parsers/matrix_parser'
2
+ require_relative 'parsers/motif_splitter'
@@ -1,32 +1,63 @@
1
- require 'ostruct'
1
+ require_relative '../errors'
2
2
 
3
- module Bioinform
3
+ module Bioinform
4
4
  class MatrixParser
5
+ # fix_nucleotides_number -- raises if matrix has not enough nucleotide columns
6
+ attr_reader :has_name, :name_pattern, :has_header_row, :has_header_column, :nucleotides_in, :fix_nucleotides_number
5
7
  def initialize(options = {})
6
- @has_name = options.fetch(:has_name, true)
7
- @name_pattern = options.fetch(:name_pattern, /^>?\s*(?<name>[^\t\r\n]+).*$/)
8
+ @has_name = options.fetch(:has_name, :auto)
9
+ @name_pattern = options.fetch(:name_pattern, /^>?\s*(?<name>[^-+\d.\t\r\n][^\t\r\n]*).*$/)
8
10
  @has_header_row = options.fetch(:has_header_row, false)
9
11
  @has_header_column = options.fetch(:has_header_column, false)
10
- @nucleotides_in = options.fetch(:nucleotides_in, :columns)
12
+ @nucleotides_in = options.fetch(:nucleotides_in, :auto)
13
+ @fix_nucleotides_number = options.fetch(:fix_nucleotides_number, 4)
11
14
 
12
- raise ':nucleotides_in option should be either :rows or :columns' unless [:rows, :columns].include?(@nucleotides_in)
15
+ raise Error, ':nucleotides_in option should be either :rows or :columns' unless [:rows, :columns, :auto].include?(@nucleotides_in)
13
16
  end
14
17
 
18
# Tells whether the matrix should be transposed in :auto mode: it has exactly
# @fix_nucleotides_number rows while its first row has a different number of
# elements. The expected count is taken from @fix_nucleotides_number on both
# sides of the check (the row length used to be compared against a literal 4),
# so a non-default nucleotide count is treated the same way as the default.
def need_transpose?(matrix)
  expected = @fix_nucleotides_number
  return false unless matrix.size == expected
  matrix.first.size != expected
end
21
+ private :need_transpose?
22
+
15
23
  def parse!(input)
16
- lines = input.lines
17
- if @has_name
24
+ lines = input.strip.lines.to_a
25
+ if @has_name == :auto
26
+ match = lines.first.match(@name_pattern)
27
+ if match
28
+ lines.shift
29
+ name = match[:name]
30
+ end
31
+ elsif @has_name == false
32
+ name = nil
33
+ else
18
34
  match = lines.shift.match(@name_pattern)
19
- raise 'Name pattern doesn\'t match' unless match
35
+ raise Error, "Name pattern doesn't match" unless match
20
36
  name = match[:name]
21
37
  end
22
38
  lines.shift if @has_header_row
23
- matrix = lines.map(&:strip).reject(&:empty?).map{|line| line.split }
39
+ matrix = lines.map(&:rstrip).reject(&:empty?).map{|line| line.split }
24
40
  matrix = matrix.map{|row| row.drop(1) } if @has_header_column
25
41
  matrix = matrix.map{|row| row.map{|el| Float(el) } }
26
42
 
27
- matrix = matrix.transpose if @nucleotides_in == :rows
28
- # raise 'Matrix not valid' unless ! matrix.empty? && matrix.all?{|pos| pos.size == 4 }
29
- OpenStruct.new(matrix: matrix, name: name)
43
+ case @nucleotides_in
44
+ when :columns
45
+ matrix = matrix
46
+ when :rows
47
+ matrix = matrix.transpose
48
+ when :auto
49
+ if @fix_nucleotides_number && need_transpose?(matrix)
50
+ matrix = matrix.transpose
51
+ end
52
+ end
53
+
54
+ if @fix_nucleotides_number
55
+ raise Error, 'Not enough nucleotides in a matrix' unless matrix.all?{|pos| pos.size >= @fix_nucleotides_number}
56
+ matrix = matrix.map{|pos| pos.first(@fix_nucleotides_number) }
57
+ end
58
+ {matrix: matrix, name: name}
59
+ rescue => e
60
+ raise Error, e.message
30
61
  end
31
62
 
32
63
  def parse(input)
@@ -38,28 +69,5 @@ module Bioinform
38
69
  rescue
39
70
  false
40
71
  end
41
-
42
- class TemporaryWrapper
43
- attr_reader :input
44
- include Bioinform::Parser::ClassMethods
45
- include Bioinform::Parser::SingleMotifParser::ClassMethods
46
- def initialize(parser)
47
- @parser, input = parser, input
48
- end
49
- def parse
50
- @parser.parse(@input)
51
- end
52
- def parse!
53
- @parser.parse!(@input)
54
- end
55
- def new(input)
56
- @input = input
57
- self
58
- end
59
- end
60
-
61
- def wrapper
62
- TemporaryWrapper.new(self)
63
- end
64
72
  end
65
73
  end
@@ -0,0 +1,45 @@
1
module Bioinform
  #
  # MotifSplitter is designed to split text into chunks with separate motifs.
  # It enumerates input line by line.
  # One can supply two options:
  # * pattern for splitter `splitter_pattern`
  # * `start_motif_pattern` which can determine start of motif but doesn't
  #   match within motif
  # If specified pattern is nil, corresponding splitting is not applied.
  # Patterns are applied by `#===` operator, thus both regexp or a Proc are
  # valid options. Proc accepts a line and should return true if line is
  # a splitter or is a motif start.
  #
  # Splitter method `#split` returns an array of strings. Each of returned
  # strings represents a motif. Motifs exclude splitter but include motif
  # start, thus one can divide input both by lines which will be dismissed
  # and by lines which will be retained.
  #
  class MotifSplitter
    attr_reader :start_motif_pattern, :splitter_pattern

    # The reader used to be declared under the misspelled name `spliiter`
    # (and always returned nil); the name is kept as an alias for callers
    # that reference it.
    alias_method :spliiter, :splitter_pattern

    def initialize(options={})
      @start_motif_pattern = options.fetch(:start_motif_pattern, /^\s*([^-+\s\d.]+|>.*)/)
      @splitter_pattern = options.fetch(:splitter_pattern, /^\s*$/)
    end

    # Array of text parts lying between splitter lines (splitters are dropped).
    # Without a splitter pattern the whole input is the only part; it is
    # wrapped into an array because #split iterates over the result.
    def parts_divided_by_splitter(input)
      return [input] unless @splitter_pattern
      input.each_line.chunk{|line| @splitter_pattern === line }.reject{|is_splitter, lines| is_splitter}.map{|is_splitter, lines| lines.join}
    end

    # Array of text parts, each beginning at a line matched by the
    # motif-start pattern (the matching line is retained).
    def parts_divided_by_motif_starts(input)
      return [input] unless @start_motif_pattern
      input.each_line.slice_before(@start_motif_pattern).map{|motif_lines| motif_lines.join }
    end

    private :parts_divided_by_splitter, :parts_divided_by_motif_starts

    # Returns an array of stripped, non-empty strings, one per motif.
    def split(input)
      parts_divided_by_splitter(input).map{|chunk|
        parts_divided_by_motif_starts(chunk)
      }.flatten.map(&:strip).reject(&:empty?)
    end
  end
end
@@ -1,18 +1,50 @@
1
- require_relative 'support/third_part/active_support/core_ext/string/filters'
2
- require_relative 'support/third_part/active_support/core_ext/hash/indifferent_access'
1
+ require_relative 'support/strip_doc'
3
2
 
4
- require_relative 'support/collect_hash'
5
- require_relative 'support/delete_many'
6
- require_relative 'support/multiline_squish'
7
- require_relative 'support/same_by'
8
- require_relative 'support/inverf'
9
- require_relative 'support/deep_dup'
3
+ module Bioinform
4
+ module Support
5
# Maps each element of `arr` to its index:
# element_indices([:A,:C,:G,:T]) ==> {:A=>0, :C=>1, :G=>2, :T=>3}
def self.element_indices(arr)
  arr.each_with_index.each_with_object({}) do |(letter, index), indices|
    indices[letter] = index
  end
end
10
9
 
11
- require_relative 'support/partial_sums'
10
# Maps every permutation of the array `key` to `value`:
# hash_keys_permuted([0,1], :A) ==> {[0,1] => :A, [1,0] => :A}
def self.hash_keys_permuted(key, value)
  key.permutation.each_with_object({}) do |perm, permuted|
    permuted[perm] = value
  end
end
12
14
 
13
- require_relative 'support/array_zip'
14
- require_relative 'support/array_product'
15
# Expands every array key into all of its permutations:
# with_key_permutations({[0,1] => :A, [0,2] => :T}) ==> {[0,1] => :A, [1,0] => :A, [0,2] => :T, [2,0]=>:T}
def self.with_key_permutations(hash)
  expanded = {}
  hash.each do |indices, letter|
    expanded = expanded.merge(hash_keys_permuted(indices, letter))
  end
  expanded
end
15
19
 
16
- require_relative 'support/advanced_scan'
17
- require_relative 'support/parameters'
18
- require_relative 'support/strip_doc'
20
+
21
# Adds a lower-case and an upper-case variant of every key:
# various_key_cases({'a' => 2, 'C' => 3, :g =>5, :T => 8}) ==> {'a' => 2, 'A' => 2, 'c' => 3, 'C' => 3, :g =>5, :G => 5, :T => 8, :t=>8}
def self.various_key_cases(hash)
  hash.each_with_object({}) do |(key, value), result|
    result[key.downcase] = value
    result[key.upcase] = value
  end
end
25
+
26
# Adds a String and a Symbol variant of every key:
# various_key_types({'a' => 2, 'C' => 3, :g =>5, :T => 8}) ==> {'a' => 2, :a => 2, 'C' => 3, :C => 3, :g =>5, 'g' => 5, :T => 8, 'T'=>8}
def self.various_key_types(hash)
  hash.each_with_object({}) do |(key, value), result|
    result[key.to_s] = value
    result[key.to_sym] = value
  end
end
30
+
31
# Applies various_key_cases and then various_key_types to the hash.
def self.various_key_case_types(hash)
  with_both_cases = various_key_cases(hash)
  various_key_types(with_both_cases)
end
34
+
35
+
36
# Adds an upper-case and a lower-case variant of every key/value pair:
# various_key_value_cases({:A => :T}) ==> {:A => :T, :a => :t}
def self.various_key_value_cases(hash)
  hash.each_with_object({}) do |(key, value), result|
    result[key.upcase] = value.upcase
    result[key.downcase] = value.downcase
  end
end
40
+
41
# Adds a String and a Symbol variant of every key/value pair:
# various_key_value_types({:A => :T}) ==> {:A => :T, 'A' => 'T'}
def self.various_key_value_types(hash)
  hash.each_with_object({}) do |(key, value), result|
    result[key.to_s] = value.to_s
    result[key.to_sym] = value.to_sym
  end
end
45
+
46
# Applies various_key_value_cases and then various_key_value_types to the hash.
def self.various_key_value_case_types(hash)
  with_both_cases = various_key_value_cases(hash)
  various_key_value_types(with_both_cases)
end
49
+ end
50
+ end