bioinform 0.1.17 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -3
  3. data/LICENSE +0 -1
  4. data/README.md +1 -1
  5. data/TODO.txt +23 -30
  6. data/bin/convert_motif +4 -0
  7. data/bin/pcm2pwm +1 -1
  8. data/bin/split_motifs +1 -1
  9. data/bioinform.gemspec +0 -2
  10. data/lib/bioinform.rb +54 -16
  11. data/lib/bioinform/alphabet.rb +85 -0
  12. data/lib/bioinform/background.rb +90 -0
  13. data/lib/bioinform/cli.rb +1 -2
  14. data/lib/bioinform/cli/convert_motif.rb +52 -17
  15. data/lib/bioinform/cli/pcm2pwm.rb +32 -26
  16. data/lib/bioinform/cli/split_motifs.rb +31 -30
  17. data/lib/bioinform/conversion_algorithms.rb +6 -0
  18. data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
  19. data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
  20. data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
  21. data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
  22. data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
  23. data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
  24. data/lib/bioinform/data_models.rb +1 -7
  25. data/lib/bioinform/data_models/named_model.rb +38 -0
  26. data/lib/bioinform/data_models/pcm.rb +18 -28
  27. data/lib/bioinform/data_models/pm.rb +73 -170
  28. data/lib/bioinform/data_models/ppm.rb +11 -24
  29. data/lib/bioinform/data_models/pwm.rb +30 -56
  30. data/lib/bioinform/errors.rb +17 -0
  31. data/lib/bioinform/formatters.rb +4 -2
  32. data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
  33. data/lib/bioinform/formatters/motif_formatter.rb +69 -0
  34. data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
  35. data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
  36. data/lib/bioinform/parsers.rb +1 -8
  37. data/lib/bioinform/parsers/matrix_parser.rb +44 -36
  38. data/lib/bioinform/parsers/motif_splitter.rb +45 -0
  39. data/lib/bioinform/support.rb +46 -14
  40. data/lib/bioinform/support/strip_doc.rb +1 -1
  41. data/lib/bioinform/version.rb +1 -1
  42. data/spec/alphabet_spec.rb +79 -0
  43. data/spec/background_spec.rb +57 -0
  44. data/spec/cli/cli_spec.rb +6 -6
  45. data/spec/cli/convert_motif_spec.rb +88 -88
  46. data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
  47. data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
  48. data/spec/cli/pcm2pwm_spec.rb +22 -23
  49. data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
  50. data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
  51. data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
  52. data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
  53. data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
  54. data/spec/cli/split_motifs_spec.rb +6 -21
  55. data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
  56. data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
  57. data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
  58. data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
  59. data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
  60. data/spec/data_models/named_model_spec.rb +41 -0
  61. data/spec/data_models/pcm_spec.rb +114 -45
  62. data/spec/data_models/pm_spec.rb +132 -333
  63. data/spec/data_models/ppm_spec.rb +47 -44
  64. data/spec/data_models/pwm_spec.rb +85 -77
  65. data/spec/fabricators/motif_formats_fabricator.rb +116 -116
  66. data/spec/formatters/consensus_formatter_spec.rb +26 -0
  67. data/spec/formatters/raw_formatter_spec.rb +169 -0
  68. data/spec/parsers/matrix_parser_spec.rb +216 -0
  69. data/spec/parsers/motif_splitter_spec.rb +87 -0
  70. data/spec/spec_helper.rb +2 -2
  71. data/spec/spec_helper_source.rb +25 -5
  72. data/spec/support_spec.rb +31 -0
  73. metadata +43 -124
  74. data/bin/merge_into_collection +0 -4
  75. data/lib/bioinform/cli/merge_into_collection.rb +0 -80
  76. data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
  77. data/lib/bioinform/data_models/collection.rb +0 -75
  78. data/lib/bioinform/data_models/motif.rb +0 -56
  79. data/lib/bioinform/formatters/raw_formatter.rb +0 -41
  80. data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
  81. data/lib/bioinform/parsers/parser.rb +0 -92
  82. data/lib/bioinform/parsers/splittable_parser.rb +0 -57
  83. data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
  84. data/lib/bioinform/parsers/string_parser.rb +0 -72
  85. data/lib/bioinform/parsers/trivial_parser.rb +0 -34
  86. data/lib/bioinform/parsers/yaml_parser.rb +0 -35
  87. data/lib/bioinform/support/advanced_scan.rb +0 -8
  88. data/lib/bioinform/support/array_product.rb +0 -6
  89. data/lib/bioinform/support/array_zip.rb +0 -6
  90. data/lib/bioinform/support/collect_hash.rb +0 -7
  91. data/lib/bioinform/support/deep_dup.rb +0 -5
  92. data/lib/bioinform/support/delete_many.rb +0 -14
  93. data/lib/bioinform/support/inverf.rb +0 -13
  94. data/lib/bioinform/support/multiline_squish.rb +0 -6
  95. data/lib/bioinform/support/parameters.rb +0 -28
  96. data/lib/bioinform/support/partial_sums.rb +0 -16
  97. data/lib/bioinform/support/same_by.rb +0 -12
  98. data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
  99. data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
  100. data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
  101. data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
  102. data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
  103. data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
  104. data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
  105. data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
  106. data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
  107. data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
  108. data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
  109. data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
  110. data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
  111. data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
  112. data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
  113. data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
  114. data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
  115. data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
  116. data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
  117. data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
  118. data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
  119. data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
  120. data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
  121. data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
  122. data/spec/cli/data/split_motifs/collection.yaml +0 -188
  123. data/spec/cli/merge_into_collection_spec.rb +0 -100
  124. data/spec/data_models/collection_spec.rb +0 -98
  125. data/spec/data_models/motif_spec.rb +0 -224
  126. data/spec/fabricators/collection_fabricator.rb +0 -8
  127. data/spec/fabricators/motif_fabricator.rb +0 -33
  128. data/spec/fabricators/pcm_fabricator.rb +0 -25
  129. data/spec/fabricators/pm_fabricator.rb +0 -52
  130. data/spec/fabricators/ppm_fabricator.rb +0 -14
  131. data/spec/fabricators/pwm_fabricator.rb +0 -16
  132. data/spec/parsers/parser_spec.rb +0 -152
  133. data/spec/parsers/string_fantom_parser_spec.rb +0 -70
  134. data/spec/parsers/string_parser_spec.rb +0 -77
  135. data/spec/parsers/trivial_parser_spec.rb +0 -64
  136. data/spec/parsers/yaml_parser_spec.rb +0 -50
  137. data/spec/support/advanced_scan_spec.rb +0 -32
  138. data/spec/support/array_product_spec.rb +0 -15
  139. data/spec/support/array_zip_spec.rb +0 -15
  140. data/spec/support/collect_hash_spec.rb +0 -15
  141. data/spec/support/delete_many_spec.rb +0 -44
  142. data/spec/support/inverf_spec.rb +0 -19
  143. data/spec/support/multiline_squish_spec.rb +0 -25
  144. data/spec/support/partial_sums_spec.rb +0 -30
  145. data/spec/support/same_by_spec.rb +0 -36
@@ -0,0 +1,26 @@
1
+ require 'bioinform/data_models/pm'
2
+ require 'bioinform/formatters/consensus_formatter'
3
+
4
+ describe Bioinform::ConsensusFormatter do # checks the consensus string built from a PM by a per-element predicate
5
+ let(:pm) { Bioinform::MotifModel::PM.new([[10,30,10,28], [30,16,16,16], [12,30,10,26], [26,27,27,1]]) } # four positions with four values each
6
+
7
+ specify('.new without a block raises error') { expect{ Bioinform::ConsensusFormatter.new }.to raise_error Bioinform::Error }
8
+
9
+ context 'custom formatter' do
10
+ let(:formatter){ Bioinform::ConsensusFormatter.new{|pos, el, ind| (pos.max - el) < pos.max * 0.1 } } # predicate holds for elements lying less than 10% of the position maximum below that maximum
11
+ specify{ expect(formatter.format_string(pm)).to eq 'YACV' }
12
+ end
13
+
14
+ context 'standard formatter' do
15
+ let(:formatter){ Bioinform::ConsensusFormatter.by_maximal_elements } # presumably keeps only the maximal element(s) of each position; implementation not visible here
16
+ specify{ expect(formatter.format_string(pm)).to eq 'CACS' }
17
+ end
18
+
19
+ specify do # the block given to .new is yielded one (position, element, index) triple per matrix element
20
+ expect{|b|
21
+ Bioinform::ConsensusFormatter.new(&b).format_string(pm)
22
+ }.to yield_successive_args( *([ [[10,30,10,28],10,0], # first yield: whole position, its element, element index
23
+ [[10,30,10,28],30,1] ] + # second yield: same position, next element and index
24
+ [Array]*14 ) ) # remaining 14 yields are matched only as Array
25
+ end
26
+ end
@@ -0,0 +1,169 @@
1
+ require 'bioinform/formatters/motif_formatter'
2
+ require 'bioinform/data_models/pm'
3
+
4
+ describe Bioinform::MotifFormatter do
5
+ let(:matrix) { [[1,2.345,6.7,8.99],
6
+ [10,11.123,-15.678,16]] }
7
+ let(:motif) { Bioinform::MotifModel::PM.new(matrix) }
8
+ let(:default_matrix_string) { "1 2.345 6.7 8.99\n"+
9
+ "10 11.123 -15.678 16" }
10
+
11
+ context 'with default configuration' do
12
+ let(:formatter) { Bioinform::MotifFormatter.new }
13
+ specify { expect(formatter.with_name).to eq :auto }
14
+ specify { expect(formatter.nucleotides_in).to eq :columns }
15
+ specify { expect(formatter.precision).to be_falsy }
16
+ specify { expect(formatter.with_nucleotide_header).to eq false }
17
+ specify { expect(formatter.with_position_header).to eq false }
18
+ end
19
+
20
+ context 'with with_name equal to false' do
21
+ let(:formatter) { Bioinform::MotifFormatter.new(with_name: false) }
22
+ specify { expect( formatter.format(motif) ).to eq default_matrix_string }
23
+ specify { expect( formatter.format(motif.named('Stub name')) ).to eq default_matrix_string }
24
+ end
25
+ context 'with with_name equal to true' do
26
+ let(:formatter) { Bioinform::MotifFormatter.new(with_name: true) }
27
+ specify { expect{ formatter.format(motif) }.to raise_error Bioinform::Error }
28
+ specify { expect( formatter.format(motif.named('')) ).to eq ">\n" +
29
+ default_matrix_string }
30
+ specify { expect( formatter.format(motif.named('Stub name')) ).to eq ">Stub name\n" +
31
+ default_matrix_string }
32
+ end
33
+ context 'with with_name equal to :auto' do
34
+ let(:formatter) { Bioinform::MotifFormatter.new(with_name: :auto) }
35
+ specify { expect( formatter.format(motif) ).to eq default_matrix_string }
36
+ specify { expect( formatter.format(motif.named('')) ).to eq default_matrix_string }
37
+ specify { expect( formatter.format(motif.named('Stub name')) ).to eq ">Stub name\n" +
38
+ default_matrix_string }
39
+ end
40
+ context 'with with_name value different from true/false/:auto' do
41
+ specify{ expect { Bioinform::MotifFormatter.new(with_name: :somewhat) }.to raise_error Bioinform::Error }
42
+ end
43
+
44
+ context 'with nucleotides_in :columns' do
45
+ let(:formatter) { Bioinform::MotifFormatter.new(nucleotides_in: :columns) }
46
+ specify { expect( formatter.format(motif) ).to eq "1 2.345 6.7 8.99\n" +
47
+ "10 11.123 -15.678 16" }
48
+ end
49
+ context 'with nucleotides_in :rows' do
50
+ let(:formatter) { Bioinform::MotifFormatter.new(nucleotides_in: :rows) }
51
+ specify { expect( formatter.format(motif) ).to eq "1 10\n" +
52
+ "2.345 11.123\n" +
53
+ "6.7 -15.678\n" +
54
+ "8.99 16" }
55
+ end
56
+ context 'with nucleotides_in not equal to :rows or :columns' do
57
+ specify { expect{ Bioinform::MotifFormatter.new(nucleotides_in: :somewhat) }.to raise_error(Bioinform::Error) }
58
+ end
59
+
60
+ context 'with precision equal to false' do
61
+ let(:formatter) { Bioinform::MotifFormatter.new(precision: false) }
62
+ specify { expect( formatter.format(motif) ).to eq "1 2.345 6.7 8.99\n" +
63
+ "10 11.123 -15.678 16" }
64
+ end
65
+ context 'with precision equal to a number' do
66
+ let(:formatter) { Bioinform::MotifFormatter.new(precision: 3) }
67
+ specify { expect( formatter.format(motif) ).to eq "1 2.35 6.7 8.99\n" +
68
+ "10 11.1 -15.7 16" }
69
+ end
70
+
71
+ context 'with nucleotide header' do
72
+ context 'with nucleotides in columns' do
73
+ let(:formatter) { Bioinform::MotifFormatter.new(with_nucleotide_header: true, nucleotides_in: :columns) }
74
+ specify { expect( formatter.format(motif) ).to eq "A C G T\n" +
75
+ "1 2.345 6.7 8.99\n" +
76
+ "10 11.123 -15.678 16" }
77
+
78
+ end
79
+ context 'with nucleotides in rows' do
80
+ let(:formatter) { Bioinform::MotifFormatter.new(with_nucleotide_header: true, nucleotides_in: :rows) }
81
+ specify { expect( formatter.format(motif) ).to eq "A 1 10\n" +
82
+ "C 2.345 11.123\n" +
83
+ "G 6.7 -15.678\n" +
84
+ "T 8.99 16" }
85
+ end
86
+ end
87
+
88
+ context 'with position header' do
89
+ context 'with nucleotides in columns' do
90
+ let(:formatter) { Bioinform::MotifFormatter.new(with_position_header: true, nucleotides_in: :columns) }
91
+ let(:long_motif) { Bioinform::MotifModel::PM.new([[1,2,3,4]] * 12) }
92
+ specify { expect( formatter.format(motif) ).to eq "01 1 2.345 6.7 8.99\n" +
93
+ "02 10 11.123 -15.678 16" }
94
+ specify { expect( formatter.format(long_motif) ).to eq "01 1 2 3 4\n" +
95
+ "02 1 2 3 4\n" +
96
+ "03 1 2 3 4\n" +
97
+ "04 1 2 3 4\n" +
98
+ "05 1 2 3 4\n" +
99
+ "06 1 2 3 4\n" +
100
+ "07 1 2 3 4\n" +
101
+ "08 1 2 3 4\n" +
102
+ "09 1 2 3 4\n" +
103
+ "10 1 2 3 4\n" +
104
+ "11 1 2 3 4\n" +
105
+ "12 1 2 3 4" }
106
+ end
107
+ context 'with nucleotides in rows' do
108
+ let(:formatter) { Bioinform::MotifFormatter.new(with_position_header: true, nucleotides_in: :rows) }
109
+ specify { expect( formatter.format(motif) ).to eq "01 02\n" +
110
+ "1 10\n" +
111
+ "2.345 11.123\n" +
112
+ "6.7 -15.678\n" +
113
+ "8.99 16" }
114
+ end
115
+ end
116
+
117
+ context 'with both headers' do
118
+ context 'with nucleotides in columns' do
119
+ let(:formatter) { Bioinform::MotifFormatter.new(with_position_header: true, with_nucleotide_header: true, nucleotides_in: :columns) }
120
+ specify { expect( formatter.format(motif) ).to eq " A C G T\n" +
121
+ "01 1 2.345 6.7 8.99\n" +
122
+ "02 10 11.123 -15.678 16" }
123
+ end
124
+ context 'with nucleotides in rows' do
125
+ let(:formatter) { Bioinform::MotifFormatter.new(with_position_header: true, with_nucleotide_header: true, nucleotides_in: :rows) }
126
+ specify { expect( formatter.format(motif) ).to eq " 01 02\n" +
127
+ "A 1 10\n" +
128
+ "C 2.345 11.123\n" +
129
+ "G 6.7 -15.678\n" +
130
+ "T 8.99 16" }
131
+ end
132
+ end
133
+
134
+ context 'on different alphabet' do
135
+ let(:matrix_15) { [[1,2,3,1.567, 12,-11,12,0,-1.1,0.6, 0.4,0.321,0.11,-1.23, 2.0],
136
+ [0,0,0,0, 0,0,0,0,0,0, 0,0,0,0, 0]] }
137
+ let(:motif) { Bioinform::MotifModel::PM.new(matrix_15, alphabet: Bioinform::IUPACAlphabet) }
138
+
139
+ specify {
140
+ expect( Bioinform::MotifFormatter.new.format(motif) )
141
+ .to eq "1 2 3 1.567 12 -11 12 0 -1.1 0.6 0.4 0.321 0.11 -1.23 2.0\n" +
142
+ "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"
143
+ }
144
+ specify {
145
+ expect( Bioinform::MotifFormatter.new(with_nucleotide_header: true).format(motif) )
146
+ .to eq "A C G T M R W S Y K V H D B N\n" +
147
+ "1 2 3 1.567 12 -11 12 0 -1.1 0.6 0.4 0.321 0.11 -1.23 2.0\n" +
148
+ "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"
149
+ }
150
+ specify {
151
+ expect( Bioinform::MotifFormatter.new(with_nucleotide_header: true, nucleotides_in: :rows).format(motif) )
152
+ .to eq "A 1 0\n" +
153
+ "C 2 0\n" +
154
+ "G 3 0\n" +
155
+ "T 1.567 0\n" +
156
+ "M 12 0\n" +
157
+ "R -11 0\n" +
158
+ "W 12 0\n" +
159
+ "S 0 0\n" +
160
+ "Y -1.1 0\n" +
161
+ "K 0.6 0\n" +
162
+ "V 0.4 0\n" +
163
+ "H 0.321 0\n" +
164
+ "D 0.11 0\n" +
165
+ "B -1.23 0\n" +
166
+ "N 2.0 0"
167
+ }
168
+ end
169
+ end
@@ -0,0 +1,216 @@
1
+ require 'bioinform/parsers/matrix_parser'
2
+
3
+ describe Bioinform::MatrixParser do
4
+ specify { expect{ Bioinform::MatrixParser.new(nucleotides_in: :somewhat) }.to raise_error Bioinform::Error }
5
+
6
+ context 'with default options' do
7
+ subject(:parser) { Bioinform::MatrixParser.new }
8
+ specify { expect(parser.has_name).to eq :auto }
9
+ specify { expect(parser.has_header_row).to eq false }
10
+ specify { expect(parser.has_header_column).to eq false }
11
+ specify { expect(parser.nucleotides_in).to eq :auto }
12
+ specify { expect(parser.fix_nucleotides_number).to eq 4 }
13
+
14
+ specify { expect(parser.name_pattern).to match ">Motif_name" }
15
+ specify { expect(parser.name_pattern).to match ">Motif name" }
16
+ specify { expect(parser.name_pattern).to match "> Motif name" }
17
+ specify { expect(parser.name_pattern).to match "Motif name" }
18
+ specify { expect(parser.name_pattern).to match "Motif name\tother info" }
19
+
20
+ specify { expect(parser.name_pattern.match(">Motif_name")[:name]).to eq "Motif_name" }
21
+ specify { expect(parser.name_pattern.match(">Motif name")[:name]).to eq "Motif name" }
22
+ specify { expect(parser.name_pattern.match("> Motif name")[:name]).to eq "Motif name" }
23
+ specify { expect(parser.name_pattern.match("Motif name")[:name]).to eq "Motif name" }
24
+ specify { expect(parser.name_pattern.match("Motif name\tother info")[:name]).to eq "Motif name" }
25
+ end
26
+
27
+ context 'parser having name' do
28
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns,has_name: true) }
29
+ let(:input) {">PM name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
30
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
31
+
32
+ specify 'trims empty lines' do
33
+ expect( parser.parse!("\n \t \n" + input + "\n\n") ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} )
34
+ end
35
+ end
36
+ context 'parser having neither name nor header' do
37
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_name: false) }
38
+ let(:input_allowed) {"1\t2\t3\t4\n" + "11\t12\t13\t14" }
39
+ let(:input_not_allowed) {">PM Name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
40
+ let(:input_not_allowed_2) {"A\tC\tG\tT\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
41
+ let(:input_not_allowed_3) {"##01\t1\t2\t3\t4\n" + "##02\t11\t12\t13\t14" }
42
+ specify { expect( parser.parse!(input_allowed) ).to eq( {name: nil, matrix: [[1,2,3,4],[11,12,13,14]]} ) }
43
+ specify { expect{ parser.parse!(input_not_allowed) }.to raise_error Bioinform::Error }
44
+ specify { expect{ parser.parse!(input_not_allowed_2) }.to raise_error Bioinform::Error }
45
+ specify { expect{ parser.parse!(input_not_allowed_3) }.to raise_error Bioinform::Error }
46
+ end
47
+ context 'with has_name equal to :auto parser can either have name or not' do
48
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_name: :auto) }
49
+ let(:input_without_name) {"1\t2\t3\t4\n" + "11\t12\t13\t14" }
50
+ let(:input_with_name) {">PM Name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
51
+ let(:input_with_bad_name) {"-Name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
52
+ specify { expect( parser.parse!(input_without_name) ).to eq( {name: nil, matrix: [[1,2,3,4],[11,12,13,14]]} ) }
53
+ specify { expect( parser.parse!(input_with_name) ).to eq( {name: 'PM Name', matrix: [[1,2,3,4],[11,12,13,14]]} ) }
54
+ specify { expect{ parser.parse!(input_with_bad_name) }.to raise_error Bioinform::Error }
55
+ end
56
+ context 'parser having name and header row' do
57
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_name: true, has_header_row: true) }
58
+ let(:input) {">PM name\n" + "A\tC\tG\tT\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
59
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
60
+ end
61
+ context 'parser having header row' do
62
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_name: false, has_header_row: true) }
63
+ let(:input) {"A\tC\tG\tT\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
64
+ specify { expect( parser.parse!(input) ).to eq( {name: nil, matrix: [[1,2,3,4],[11,12,13,14]]} ) }
65
+ specify { expect{ parser.parse!("Motif name\n" + input) }.to raise_error Bioinform::Error }
66
+ end
67
+ context 'parser having header column' do
68
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_header_column: true) }
69
+ let(:input) {">PM name\n" + "##01\t1\t2\t3\t4\n" + "##02\t11\t12\t13\t14" }
70
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
71
+ end
72
+ context 'parser having both headers' do
73
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_header_row: true, has_header_column: true) }
74
+ let(:input) {">PM name\n" + "X\tA\tC\tG\tT\n" + "##01\t1\t2\t3\t4\n" + "##02\t11\t12\t13\t14" }
75
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
76
+ end
77
+
78
+ context 'parser for transposed matrix' do
79
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :rows) }
80
+ let(:input) {">PM name\n" + "1\t11\n" + "2\t12\n" + "3\t13\n" + "4\t14" }
81
+ specify { expect( parser.parse!(input) ).to eq( {name: 'PM name', matrix: [[1,2,3,4],[11,12,13,14]]} ) }
82
+ end
83
+ context 'parser for transposed matrix with row header' do
84
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :rows, has_header_row: true) }
85
+ let(:input) {">PM name\n" + "##01\t##02\n" + "1\t11\n" + "2\t12\n" + "3\t13\n" + "4\t14" }
86
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
87
+ end
88
+ context 'parser for transposed matrix with column header' do
89
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :rows, has_header_column: true) }
90
+ let(:input) {">PM name\n" + "A\t1\t11\n" + "C\t2\t12\n" + "G\t3\t13\n" + "T\t4\t14" }
91
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
92
+ end
93
+ context 'parser for transposed matrix with both header' do
94
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :rows, has_header_column: true, has_header_row: true) }
95
+ let(:input) {">PM name\n" + "X\t##01\t##02\n" + "A\t1\t11\n" + "C\t2\t12\n" + "G\t3\t13\n" + "T\t4\t14" }
96
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
97
+ end
98
+
99
+ context 'parser having custom name pattern' do
100
+ subject(:parser) { Bioinform::MatrixParser.new(has_name: true, name_pattern: /^NA>(?<name>.+)$/) }
101
+ let(:input_allowed) {"NA>Motif name\tother info\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
102
+ let(:input_not_allowed) {"Motif name\tother info\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
103
+ specify { expect( parser.parse!(input_allowed) ).to eq( {name: "Motif name\tother info", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
104
+ specify { expect{ parser.parse!(input_not_allowed) }.to raise_error Bioinform::Error }
105
+ end
106
+
107
+ context 'parser reducing number of nucleotides' do
108
+ subject(:parser) { Bioinform::MatrixParser.new(has_name: true) }
109
+ let(:input) {">PM name\n" + "1\t2\t3\t4\t5\n" + "11\t12\t13\t14\t15" }
110
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
111
+ end
112
+ context 'parser for transposed matrix reducing number of nucleotides' do
113
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :rows) }
114
+ let(:input) {">PM name\n" + "1\t11\n" + "2\t12\n" + "3\t13\n" + "4\t14\n" + "5\t15"}
115
+ specify { expect( parser.parse!(input) ).to eq( {name: 'PM name', matrix: [[1,2,3,4],[11,12,13,14]]} ) }
116
+ end
117
+ context 'parser not reducing number of nucleotides' do
118
+ subject(:parser) { Bioinform::MatrixParser.new(has_name: true, fix_nucleotides_number: false) }
119
+ let(:input) {">PM name\n" + "1\t2\t3\t4\t5\n" + "11\t12\t13\t14\t15" }
120
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4,5],[11,12,13,14,15]]} ) }
121
+ end
122
+ context 'parser reducing number of nucleotides to a non-standard one' do
123
+ subject(:parser) { Bioinform::MatrixParser.new(has_name: true, fix_nucleotides_number: 3) }
124
+ let(:input) {">PM name\n" + "1\t2\t3\t4\t5\n" + "11\t12\t13\t14\t15" }
125
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3],[11,12,13]]} ) }
126
+ end
127
+ context 'parser which hasn\'t enough number of nucleotides' do
128
+ subject(:parser) { Bioinform::MatrixParser.new(has_name: true, fix_nucleotides_number: 4) }
129
+ let(:input) {">PM name\n" + "1\t2\t3\n" + "11\t12\t13" }
130
+ specify { expect{ parser.parse!(input) }.to raise_error Bioinform::Error }
131
+ end
132
+
133
+ context 'parser with auto transposition' do
134
+ let(:input_not_transposed) {">PM Name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
135
+ let(:input_transposed) {">PM Name\n" + "1\t11\n" + "2\t12\n" + "3\t13\n" + "4\t14"}
136
+ let(:input_4x4) {">PM Name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14"}
137
+ context 'with fixed nucleotides number' do
138
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :auto, fix_nucleotides_number: 4) }
139
+ specify { expect(parser.parse(input_not_transposed)).to eq({name:'PM Name', matrix: [[1,2,3,4],[11,12,13,14]]}) }
140
+ specify { expect(parser.parse(input_transposed)).to eq({name:'PM Name', matrix: [[1,2,3,4],[11,12,13,14]]}) }
141
+ specify { expect(parser.parse(input_4x4)).to eq({name:'PM Name', matrix: [[1,2,3,4],[11,12,13,14],[1,2,3,4],[11,12,13,14]]}) }
142
+ end
143
+ context 'with non fixed nucleotides number' do
144
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :auto, fix_nucleotides_number: false) }
145
+ specify { expect(parser.parse(input_not_transposed)).to eq({name:'PM Name', matrix: [[1,2,3,4],[11,12,13,14]]}) }
146
+ specify { expect(parser.parse(input_transposed)).to eq({name:'PM Name', matrix: [[1,11],[2,12],[3,13],[4,14]]}) }
147
+ specify { expect(parser.parse(input_4x4)).to eq({name:'PM Name', matrix: [[1,2,3,4],[11,12,13,14],[1,2,3,4],[11,12,13,14]]}) }
148
+ end
149
+ end
150
+
151
+ context 'FANTOM-formatted motifs' do
152
+ let(:parser) do
153
+ Bioinform::MatrixParser.new( has_name: true, name_pattern: /^NA\s+(?<name>.+)$/,
154
+ has_header_row: true, has_header_column: true, nucleotides_in: :columns,
155
+ reduce_to_n_nucleotides: 4 )
156
+ end
157
+
158
+ specify 'parse strings in FANTOM format' do
159
+ input = "NA PM_name\n" +
160
+ "P0 A C G T\n" +
161
+ "P1 1 2 3 4\n" +
162
+ "P2 5 6 7 8"
163
+ expect(parser.parse(input)).to eq({matrix: [[1,2,3,4],[5,6,7,8]], name: 'PM_name'})
164
+ end
165
+
166
+
167
+ specify 'ignores additional columns' do
168
+ input = "NA PM_name\n" +
169
+ "P0 A C G T S P\n" +
170
+ "P1 1 2 3 4 5 10\n" +
171
+ "P2 5 6 7 8 5 11"
172
+ expect(parser.parse(input)).to eq({matrix: [[1,2,3,4],[5,6,7,8]], name: 'PM_name'})
173
+ end
174
+
175
+ specify 'parses string with more than 10 positions(2-digit row numbers)' do
176
+ input = "NA PM_name\n" +
177
+ "P0 A C G T\n" +
178
+ "P1 1 2 3 4\n" +
179
+ "P2 5 6 7 8\n" +
180
+ "P3 1 2 3 4\n" +
181
+ "P4 5 6 7 8\n" +
182
+ "P5 1 2 3 4\n" +
183
+ "P6 5 6 7 8\n" +
184
+ "P7 1 2 3 4\n" +
185
+ "P8 5 6 7 8\n" +
186
+ "P9 1 2 3 4\n" +
187
+ "P10 5 6 7 8\n" +
188
+ "P11 1 2 3 4\n" +
189
+ "P12 5 6 7 8"
190
+ expect(parser.parse(input)).to eq({matrix: [[1,2,3,4],[5,6,7,8]]*6, name: 'PM_name'})
191
+ end
192
+
193
+ good_cases = {
194
+ 'Nx4 string' => {input: "1 2 3 4\n5 6 7 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: nil} },
195
+ '4xN string' => {input: "1 5\n2 6\n3 7\n 4 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: nil} },
196
+ 'string with name' => {input: "PM_name\n1 5\n2 6\n3 7\n 4 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: 'PM_name'} },
197
+ 'string with name (with introduction sign)' => {input: ">\t PM_name\n1 5\n2 6\n3 7\n 4 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: 'PM_name'} },
198
+ 'string with name (with special characters)' => {input: "Testmatrix_first:subname+sub-subname\n1 5\n2 6\n3 7\n 4 8",
199
+ result: {matrix: [[1,2,3,4],[5,6,7,8]], name: 'Testmatrix_first:subname+sub-subname'} },
200
+ 'string with float numerics' => {input: "1.23 4.56 7.8 9.0\n9 -8.7 6.54 -3210", result: {matrix: [[1.23, 4.56, 7.8, 9.0],[9, -8.7, 6.54, -3210]], name: nil} },
201
+ 'string with exponents' => {input: "123e-2 0.456e+1 7.8 9.0\n9 -87000000000E-10 6.54 -3.210e3", result: {matrix: [[1.23, 4.56, 7.8, 9.0],[9, -8.7, 6.54, -3210]], name: nil} },
202
+ 'string with multiple spaces and tabs' => {input: "1 \t\t 2 3 4\n 5 6 7 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: nil} },
203
+ 'string with preceeding and terminating newlines' => {input: "\n\n\t 1 2 3 4\n5 6 7 8 \n\t\n", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: nil} },
204
+ 'string with windows crlf' => {input: "1 2 3 4\r\n5 6 7 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: nil} },
205
+ }
206
+
207
+ bad_cases = {
208
+ 'string with non-numeric input' => {input: "1.23 4.56 78aaa 9.0\n9 -8.7 6.54 -3210" },
209
+ 'string with non-numeric input at the end of line' => {input: "1.23 4.56 78 9.0aaa\n9 -8.7 6.54 -3210" },
210
+ 'string with non-numeric input at a separate line' => {input: "1.23 4.56 78 9.0\naaa\n9 -8.7 6.54 -3210" },
211
+ 'string with empty exponent sign' => {input: "1.23 4.56 7.8 9.0\n 9e -8.7 6.54 3210" }
212
+ }
213
+
214
+ parser_specs(Bioinform::MatrixParser.new, good_cases, bad_cases)
215
+ end
216
+ end
@@ -0,0 +1,87 @@
1
+ require 'bioinform/parsers/motif_splitter'
2
+
3
+ describe Bioinform::MotifSplitter do
4
+ let(:motif_unnamed) { "1 2 3 4\n"+"5\t6\t7\t8" }
5
+ let(:motif_with_floats) { "motif2\n 1.0 1.1 1.2 1.3\n 14 15 16 17\n 19 20 21 22" }
6
+ let(:motif_with_signs_and_exponents) { "> motif3\n-2.0 1.3e-3 -5.47 5.2\n+3.4 7 3 3" }
7
+
8
+ context 'default splitter\n' do
9
+ let(:motif_splitter) { Bioinform::MotifSplitter.new }
10
+
11
+ specify do
12
+ input = motif_unnamed + "\n" + motif_with_floats + "\n" + motif_with_signs_and_exponents
13
+ expect(motif_splitter.split(input)).to eq [motif_unnamed, motif_with_floats, motif_with_signs_and_exponents]
14
+ end
15
+
16
+ specify do
17
+ input = motif_unnamed + "\n" + motif_with_floats + "\n" + motif_with_signs_and_exponents + "\n"
18
+ expect(motif_splitter.split(input)).to eq [motif_unnamed, motif_with_floats, motif_with_signs_and_exponents]
19
+ end
20
+
21
+ specify do
22
+ input = "Motif1 name\n" + motif_unnamed + "\n" + motif_with_floats + "\n" + motif_with_signs_and_exponents
23
+ expect(motif_splitter.split(input)).to eq ["Motif1 name\n" + motif_unnamed, motif_with_floats, motif_with_signs_and_exponents]
24
+ end
25
+
26
+ specify { expect(motif_splitter.split(motif_unnamed + "\n\n" + motif_unnamed)).to eq [motif_unnamed, motif_unnamed] }
27
+
28
+ specify { expect(motif_splitter.split(motif_unnamed + "\n\n\n" + motif_unnamed)).to eq [motif_unnamed, motif_unnamed] }
29
+ end
30
+
31
+ context 'with specified pattern' do
32
+ let(:motif_splitter) { Bioinform::MotifSplitter.new(start_motif_pattern: /^NA\s+\w+$/, splitter_pattern: /^\/\/\s$/) }
33
+
34
+ let(:input_1) {
35
+ "NA motif_1\n" +
36
+ "P0 A C G T\n" +
37
+ "P1 0 1 2 3\n" +
38
+ "P2 4 5 6 7"
39
+ }
40
+
41
+ let(:input_2) {
42
+ "NA motif_2\n" +
43
+ "P0 A C G T\n" +
44
+ "P1 1 2 3 4\n" +
45
+ "P2 5 6 7 8\n" +
46
+ "P3 9 10 11 12"
47
+ }
48
+
49
+ let(:input_3) {
50
+ "NA motif_3\n" +
51
+ "P0 A C G T\n" +
52
+ "P1 2 3 4 5\n" +
53
+ "P2 6 7 8 9"
54
+ }
55
+
56
+ # this input doesn't have pattern of start motif
57
+ let(:input_wo_name) {
58
+ "P0 A C G T\n" +
59
+ "P1 3 4 5 6\n" +
60
+ "P2 7 8 9 10"
61
+ }
62
+
63
+ specify do
64
+ input = "//\n" +
65
+ input_1 + "\n" +
66
+ "//\n" +
67
+ "//\n" +
68
+ input_2 + "\n" +
69
+ "//\n" +
70
+ input_3
71
+ expect(motif_splitter.split(input)).to eq [input_1, input_2, input_3]
72
+ end
73
+
74
+ specify 'splitter (w/o motif starter) is enough to split motifs' do
75
+ input = input_1 + "\n" +
76
+ "//\n" +
77
+ input_wo_name
78
+ expect(motif_splitter.split(input)).to eq [input_1, input_wo_name]
79
+ end
80
+
81
+ specify 'motif starter (w/o splitter) is enough to split motifs' do
82
+ input = input_1 + "\n" +
83
+ input_2
84
+ expect(motif_splitter.split(input)).to eq [input_1, input_2]
85
+ end
86
+ end
87
+ end