bioinform 0.1.17 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (145)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -3
  3. data/LICENSE +0 -1
  4. data/README.md +1 -1
  5. data/TODO.txt +23 -30
  6. data/bin/convert_motif +4 -0
  7. data/bin/pcm2pwm +1 -1
  8. data/bin/split_motifs +1 -1
  9. data/bioinform.gemspec +0 -2
  10. data/lib/bioinform.rb +54 -16
  11. data/lib/bioinform/alphabet.rb +85 -0
  12. data/lib/bioinform/background.rb +90 -0
  13. data/lib/bioinform/cli.rb +1 -2
  14. data/lib/bioinform/cli/convert_motif.rb +52 -17
  15. data/lib/bioinform/cli/pcm2pwm.rb +32 -26
  16. data/lib/bioinform/cli/split_motifs.rb +31 -30
  17. data/lib/bioinform/conversion_algorithms.rb +6 -0
  18. data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
  19. data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
  20. data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
  21. data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
  22. data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
  23. data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
  24. data/lib/bioinform/data_models.rb +1 -7
  25. data/lib/bioinform/data_models/named_model.rb +38 -0
  26. data/lib/bioinform/data_models/pcm.rb +18 -28
  27. data/lib/bioinform/data_models/pm.rb +73 -170
  28. data/lib/bioinform/data_models/ppm.rb +11 -24
  29. data/lib/bioinform/data_models/pwm.rb +30 -56
  30. data/lib/bioinform/errors.rb +17 -0
  31. data/lib/bioinform/formatters.rb +4 -2
  32. data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
  33. data/lib/bioinform/formatters/motif_formatter.rb +69 -0
  34. data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
  35. data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
  36. data/lib/bioinform/parsers.rb +1 -8
  37. data/lib/bioinform/parsers/matrix_parser.rb +44 -36
  38. data/lib/bioinform/parsers/motif_splitter.rb +45 -0
  39. data/lib/bioinform/support.rb +46 -14
  40. data/lib/bioinform/support/strip_doc.rb +1 -1
  41. data/lib/bioinform/version.rb +1 -1
  42. data/spec/alphabet_spec.rb +79 -0
  43. data/spec/background_spec.rb +57 -0
  44. data/spec/cli/cli_spec.rb +6 -6
  45. data/spec/cli/convert_motif_spec.rb +88 -88
  46. data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
  47. data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
  48. data/spec/cli/pcm2pwm_spec.rb +22 -23
  49. data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
  50. data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
  51. data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
  52. data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
  53. data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
  54. data/spec/cli/split_motifs_spec.rb +6 -21
  55. data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
  56. data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
  57. data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
  58. data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
  59. data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
  60. data/spec/data_models/named_model_spec.rb +41 -0
  61. data/spec/data_models/pcm_spec.rb +114 -45
  62. data/spec/data_models/pm_spec.rb +132 -333
  63. data/spec/data_models/ppm_spec.rb +47 -44
  64. data/spec/data_models/pwm_spec.rb +85 -77
  65. data/spec/fabricators/motif_formats_fabricator.rb +116 -116
  66. data/spec/formatters/consensus_formatter_spec.rb +26 -0
  67. data/spec/formatters/raw_formatter_spec.rb +169 -0
  68. data/spec/parsers/matrix_parser_spec.rb +216 -0
  69. data/spec/parsers/motif_splitter_spec.rb +87 -0
  70. data/spec/spec_helper.rb +2 -2
  71. data/spec/spec_helper_source.rb +25 -5
  72. data/spec/support_spec.rb +31 -0
  73. metadata +43 -124
  74. data/bin/merge_into_collection +0 -4
  75. data/lib/bioinform/cli/merge_into_collection.rb +0 -80
  76. data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
  77. data/lib/bioinform/data_models/collection.rb +0 -75
  78. data/lib/bioinform/data_models/motif.rb +0 -56
  79. data/lib/bioinform/formatters/raw_formatter.rb +0 -41
  80. data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
  81. data/lib/bioinform/parsers/parser.rb +0 -92
  82. data/lib/bioinform/parsers/splittable_parser.rb +0 -57
  83. data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
  84. data/lib/bioinform/parsers/string_parser.rb +0 -72
  85. data/lib/bioinform/parsers/trivial_parser.rb +0 -34
  86. data/lib/bioinform/parsers/yaml_parser.rb +0 -35
  87. data/lib/bioinform/support/advanced_scan.rb +0 -8
  88. data/lib/bioinform/support/array_product.rb +0 -6
  89. data/lib/bioinform/support/array_zip.rb +0 -6
  90. data/lib/bioinform/support/collect_hash.rb +0 -7
  91. data/lib/bioinform/support/deep_dup.rb +0 -5
  92. data/lib/bioinform/support/delete_many.rb +0 -14
  93. data/lib/bioinform/support/inverf.rb +0 -13
  94. data/lib/bioinform/support/multiline_squish.rb +0 -6
  95. data/lib/bioinform/support/parameters.rb +0 -28
  96. data/lib/bioinform/support/partial_sums.rb +0 -16
  97. data/lib/bioinform/support/same_by.rb +0 -12
  98. data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
  99. data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
  100. data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
  101. data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
  102. data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
  103. data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
  104. data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
  105. data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
  106. data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
  107. data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
  108. data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
  109. data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
  110. data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
  111. data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
  112. data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
  113. data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
  114. data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
  115. data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
  116. data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
  117. data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
  118. data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
  119. data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
  120. data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
  121. data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
  122. data/spec/cli/data/split_motifs/collection.yaml +0 -188
  123. data/spec/cli/merge_into_collection_spec.rb +0 -100
  124. data/spec/data_models/collection_spec.rb +0 -98
  125. data/spec/data_models/motif_spec.rb +0 -224
  126. data/spec/fabricators/collection_fabricator.rb +0 -8
  127. data/spec/fabricators/motif_fabricator.rb +0 -33
  128. data/spec/fabricators/pcm_fabricator.rb +0 -25
  129. data/spec/fabricators/pm_fabricator.rb +0 -52
  130. data/spec/fabricators/ppm_fabricator.rb +0 -14
  131. data/spec/fabricators/pwm_fabricator.rb +0 -16
  132. data/spec/parsers/parser_spec.rb +0 -152
  133. data/spec/parsers/string_fantom_parser_spec.rb +0 -70
  134. data/spec/parsers/string_parser_spec.rb +0 -77
  135. data/spec/parsers/trivial_parser_spec.rb +0 -64
  136. data/spec/parsers/yaml_parser_spec.rb +0 -50
  137. data/spec/support/advanced_scan_spec.rb +0 -32
  138. data/spec/support/array_product_spec.rb +0 -15
  139. data/spec/support/array_zip_spec.rb +0 -15
  140. data/spec/support/collect_hash_spec.rb +0 -15
  141. data/spec/support/delete_many_spec.rb +0 -44
  142. data/spec/support/inverf_spec.rb +0 -19
  143. data/spec/support/multiline_squish_spec.rb +0 -25
  144. data/spec/support/partial_sums_spec.rb +0 -30
  145. data/spec/support/same_by_spec.rb +0 -36
@@ -0,0 +1,26 @@
1
+ require 'bioinform/data_models/pm'
2
+ require 'bioinform/formatters/consensus_formatter'
3
+
4
+ describe Bioinform::ConsensusFormatter do
5
+ let(:pm) { Bioinform::MotifModel::PM.new([[10,30,10,28], [30,16,16,16], [12,30,10,26], [26,27,27,1]]) }
6
+
7
+ specify('.new without a block raises error') { expect{ Bioinform::ConsensusFormatter.new }.to raise_error Bioinform::Error }
8
+
9
+ context 'custom formatter' do
10
+ let(:formatter){ Bioinform::ConsensusFormatter.new{|pos, el, ind| (pos.max - el) < pos.max * 0.1 } }
11
+ specify{ expect(formatter.format_string(pm)).to eq 'YACV' }
12
+ end
13
+
14
+ context 'standard formatter' do
15
+ let(:formatter){ Bioinform::ConsensusFormatter.by_maximal_elements }
16
+ specify{ expect(formatter.format_string(pm)).to eq 'CACS' }
17
+ end
18
+
19
+ specify do
20
+ expect{|b|
21
+ Bioinform::ConsensusFormatter.new(&b).format_string(pm)
22
+ }.to yield_successive_args( *([ [[10,30,10,28],10,0], # col,el,ind
23
+ [[10,30,10,28],30,1] ] + # col,el,ind
24
+ [Array]*14 ) ) # rest triples
25
+ end
26
+ end
@@ -0,0 +1,169 @@
1
+ require 'bioinform/formatters/motif_formatter'
2
+ require 'bioinform/data_models/pm'
3
+
4
+ describe Bioinform::MotifFormatter do
5
+ let(:matrix) { [[1,2.345,6.7,8.99],
6
+ [10,11.123,-15.678,16]] }
7
+ let(:motif) { Bioinform::MotifModel::PM.new(matrix) }
8
+ let(:default_matrix_string) { "1 2.345 6.7 8.99\n"+
9
+ "10 11.123 -15.678 16" }
10
+
11
+ context 'with default configuration' do
12
+ let(:formatter) { Bioinform::MotifFormatter.new }
13
+ specify { expect(formatter.with_name).to eq :auto }
14
+ specify { expect(formatter.nucleotides_in).to eq :columns }
15
+ specify { expect(formatter.precision).to be_falsy }
16
+ specify { expect(formatter.with_nucleotide_header).to eq false }
17
+ specify { expect(formatter.with_position_header).to eq false }
18
+ end
19
+
20
+ context 'with with_name equal to false' do
21
+ let(:formatter) { Bioinform::MotifFormatter.new(with_name: false) }
22
+ specify { expect( formatter.format(motif) ).to eq default_matrix_string }
23
+ specify { expect( formatter.format(motif.named('Stub name')) ).to eq default_matrix_string }
24
+ end
25
+ context 'with with_name equal to true' do
26
+ let(:formatter) { Bioinform::MotifFormatter.new(with_name: true) }
27
+ specify { expect{ formatter.format(motif) }.to raise_error Bioinform::Error }
28
+ specify { expect( formatter.format(motif.named('')) ).to eq ">\n" +
29
+ default_matrix_string }
30
+ specify { expect( formatter.format(motif.named('Stub name')) ).to eq ">Stub name\n" +
31
+ default_matrix_string }
32
+ end
33
+ context 'with with_name equal to :auto' do
34
+ let(:formatter) { Bioinform::MotifFormatter.new(with_name: :auto) }
35
+ specify { expect( formatter.format(motif) ).to eq default_matrix_string }
36
+ specify { expect( formatter.format(motif.named('')) ).to eq default_matrix_string }
37
+ specify { expect( formatter.format(motif.named('Stub name')) ).to eq ">Stub name\n" +
38
+ default_matrix_string }
39
+ end
40
+ context 'with with_name value different from true/false/:auto' do
41
+ specify{ expect { Bioinform::MotifFormatter.new(with_name: :somewhat) }.to raise_error Bioinform::Error }
42
+ end
43
+
44
+ context 'with nucleotides_in :columns' do
45
+ let(:formatter) { Bioinform::MotifFormatter.new(nucleotides_in: :columns) }
46
+ specify { expect( formatter.format(motif) ).to eq "1 2.345 6.7 8.99\n" +
47
+ "10 11.123 -15.678 16" }
48
+ end
49
+ context 'with nucleotides_in :rows' do
50
+ let(:formatter) { Bioinform::MotifFormatter.new(nucleotides_in: :rows) }
51
+ specify { expect( formatter.format(motif) ).to eq "1 10\n" +
52
+ "2.345 11.123\n" +
53
+ "6.7 -15.678\n" +
54
+ "8.99 16" }
55
+ end
56
+ context 'with nucleotides_in not equal to :rows or :columns' do
57
+ specify { expect{ Bioinform::MotifFormatter.new(nucleotides_in: :somewhat) }.to raise_error(Bioinform::Error) }
58
+ end
59
+
60
+ context 'with precision equal to false' do
61
+ let(:formatter) { Bioinform::MotifFormatter.new(precision: false) }
62
+ specify { expect( formatter.format(motif) ).to eq "1 2.345 6.7 8.99\n" +
63
+ "10 11.123 -15.678 16" }
64
+ end
65
+ context 'with precision equal to a number' do
66
+ let(:formatter) { Bioinform::MotifFormatter.new(precision: 3) }
67
+ specify { expect( formatter.format(motif) ).to eq "1 2.35 6.7 8.99\n" +
68
+ "10 11.1 -15.7 16" }
69
+ end
70
+
71
+ context 'with nucleotide header' do
72
+ context 'with nucleotides in columns' do
73
+ let(:formatter) { Bioinform::MotifFormatter.new(with_nucleotide_header: true, nucleotides_in: :columns) }
74
+ specify { expect( formatter.format(motif) ).to eq "A C G T\n" +
75
+ "1 2.345 6.7 8.99\n" +
76
+ "10 11.123 -15.678 16" }
77
+
78
+ end
79
+ context 'with nucleotides in rows' do
80
+ let(:formatter) { Bioinform::MotifFormatter.new(with_nucleotide_header: true, nucleotides_in: :rows) }
81
+ specify { expect( formatter.format(motif) ).to eq "A 1 10\n" +
82
+ "C 2.345 11.123\n" +
83
+ "G 6.7 -15.678\n" +
84
+ "T 8.99 16" }
85
+ end
86
+ end
87
+
88
+ context 'with position header' do
89
+ context 'with nucleotides in columns' do
90
+ let(:formatter) { Bioinform::MotifFormatter.new(with_position_header: true, nucleotides_in: :columns) }
91
+ let(:long_motif) { Bioinform::MotifModel::PM.new([[1,2,3,4]] * 12) }
92
+ specify { expect( formatter.format(motif) ).to eq "01 1 2.345 6.7 8.99\n" +
93
+ "02 10 11.123 -15.678 16" }
94
+ specify { expect( formatter.format(long_motif) ).to eq "01 1 2 3 4\n" +
95
+ "02 1 2 3 4\n" +
96
+ "03 1 2 3 4\n" +
97
+ "04 1 2 3 4\n" +
98
+ "05 1 2 3 4\n" +
99
+ "06 1 2 3 4\n" +
100
+ "07 1 2 3 4\n" +
101
+ "08 1 2 3 4\n" +
102
+ "09 1 2 3 4\n" +
103
+ "10 1 2 3 4\n" +
104
+ "11 1 2 3 4\n" +
105
+ "12 1 2 3 4" }
106
+ end
107
+ context 'with nucleotides in rows' do
108
+ let(:formatter) { Bioinform::MotifFormatter.new(with_position_header: true, nucleotides_in: :rows) }
109
+ specify { expect( formatter.format(motif) ).to eq "01 02\n" +
110
+ "1 10\n" +
111
+ "2.345 11.123\n" +
112
+ "6.7 -15.678\n" +
113
+ "8.99 16" }
114
+ end
115
+ end
116
+
117
+ context 'with both headers' do
118
+ context 'with nucleotides in columns' do
119
+ let(:formatter) { Bioinform::MotifFormatter.new(with_position_header: true, with_nucleotide_header: true, nucleotides_in: :columns) }
120
+ specify { expect( formatter.format(motif) ).to eq " A C G T\n" +
121
+ "01 1 2.345 6.7 8.99\n" +
122
+ "02 10 11.123 -15.678 16" }
123
+ end
124
+ context 'with nucleotides in rows' do
125
+ let(:formatter) { Bioinform::MotifFormatter.new(with_position_header: true, with_nucleotide_header: true, nucleotides_in: :rows) }
126
+ specify { expect( formatter.format(motif) ).to eq " 01 02\n" +
127
+ "A 1 10\n" +
128
+ "C 2.345 11.123\n" +
129
+ "G 6.7 -15.678\n" +
130
+ "T 8.99 16" }
131
+ end
132
+ end
133
+
134
+ context 'on different alphabet' do
135
+ let(:matrix_15) { [[1,2,3,1.567, 12,-11,12,0,-1.1,0.6, 0.4,0.321,0.11,-1.23, 2.0],
136
+ [0,0,0,0, 0,0,0,0,0,0, 0,0,0,0, 0]] }
137
+ let(:motif) { Bioinform::MotifModel::PM.new(matrix_15, alphabet: Bioinform::IUPACAlphabet) }
138
+
139
+ specify {
140
+ expect( Bioinform::MotifFormatter.new.format(motif) )
141
+ .to eq "1 2 3 1.567 12 -11 12 0 -1.1 0.6 0.4 0.321 0.11 -1.23 2.0\n" +
142
+ "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"
143
+ }
144
+ specify {
145
+ expect( Bioinform::MotifFormatter.new(with_nucleotide_header: true).format(motif) )
146
+ .to eq "A C G T M R W S Y K V H D B N\n" +
147
+ "1 2 3 1.567 12 -11 12 0 -1.1 0.6 0.4 0.321 0.11 -1.23 2.0\n" +
148
+ "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"
149
+ }
150
+ specify {
151
+ expect( Bioinform::MotifFormatter.new(with_nucleotide_header: true, nucleotides_in: :rows).format(motif) )
152
+ .to eq "A 1 0\n" +
153
+ "C 2 0\n" +
154
+ "G 3 0\n" +
155
+ "T 1.567 0\n" +
156
+ "M 12 0\n" +
157
+ "R -11 0\n" +
158
+ "W 12 0\n" +
159
+ "S 0 0\n" +
160
+ "Y -1.1 0\n" +
161
+ "K 0.6 0\n" +
162
+ "V 0.4 0\n" +
163
+ "H 0.321 0\n" +
164
+ "D 0.11 0\n" +
165
+ "B -1.23 0\n" +
166
+ "N 2.0 0"
167
+ }
168
+ end
169
+ end
@@ -0,0 +1,216 @@
1
+ require 'bioinform/parsers/matrix_parser'
2
+
3
+ describe Bioinform::MatrixParser do
4
+ specify { expect{ Bioinform::MatrixParser.new(nucleotides_in: :somewhat) }.to raise_error Bioinform::Error }
5
+
6
+ context 'with default options' do
7
+ subject(:parser) { Bioinform::MatrixParser.new }
8
+ specify { expect(parser.has_name).to eq :auto }
9
+ specify { expect(parser.has_header_row).to eq false }
10
+ specify { expect(parser.has_header_column).to eq false }
11
+ specify { expect(parser.nucleotides_in).to eq :auto }
12
+ specify { expect(parser.fix_nucleotides_number).to eq 4 }
13
+
14
+ specify { expect(parser.name_pattern).to match ">Motif_name" }
15
+ specify { expect(parser.name_pattern).to match ">Motif name" }
16
+ specify { expect(parser.name_pattern).to match "> Motif name" }
17
+ specify { expect(parser.name_pattern).to match "Motif name" }
18
+ specify { expect(parser.name_pattern).to match "Motif name\tother info" }
19
+
20
+ specify { expect(parser.name_pattern.match(">Motif_name")[:name]).to eq "Motif_name" }
21
+ specify { expect(parser.name_pattern.match(">Motif name")[:name]).to eq "Motif name" }
22
+ specify { expect(parser.name_pattern.match("> Motif name")[:name]).to eq "Motif name" }
23
+ specify { expect(parser.name_pattern.match("Motif name")[:name]).to eq "Motif name" }
24
+ specify { expect(parser.name_pattern.match("Motif name\tother info")[:name]).to eq "Motif name" }
25
+ end
26
+
27
+ context 'parser having name' do
28
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns,has_name: true) }
29
+ let(:input) {">PM name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
30
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
31
+
32
+ specify 'trims empty lines' do
33
+ expect( parser.parse!("\n \t \n" + input + "\n\n") ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} )
34
+ end
35
+ end
36
+ context 'parser having neither name nor header' do
37
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_name: false) }
38
+ let(:input_allowed) {"1\t2\t3\t4\n" + "11\t12\t13\t14" }
39
+ let(:input_not_allowed) {">PM Name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
40
+ let(:input_not_allowed_2) {"A\tC\tG\tT\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
41
+ let(:input_not_allowed_3) {"##01\t1\t2\t3\t4\n" + "##02\t11\t12\t13\t14" }
42
+ specify { expect( parser.parse!(input_allowed) ).to eq( {name: nil, matrix: [[1,2,3,4],[11,12,13,14]]} ) }
43
+ specify { expect{ parser.parse!(input_not_allowed) }.to raise_error Bioinform::Error }
44
+ specify { expect{ parser.parse!(input_not_allowed_2) }.to raise_error Bioinform::Error }
45
+ specify { expect{ parser.parse!(input_not_allowed_3) }.to raise_error Bioinform::Error }
46
+ end
47
+ context 'with has_name equal to :auto parser can either have name or not' do
48
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_name: :auto) }
49
+ let(:input_without_name) {"1\t2\t3\t4\n" + "11\t12\t13\t14" }
50
+ let(:input_with_name) {">PM Name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
51
+ let(:input_with_bad_name) {"-Name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
52
+ specify { expect( parser.parse!(input_without_name) ).to eq( {name: nil, matrix: [[1,2,3,4],[11,12,13,14]]} ) }
53
+ specify { expect( parser.parse!(input_with_name) ).to eq( {name: 'PM Name', matrix: [[1,2,3,4],[11,12,13,14]]} ) }
54
+ specify { expect{ parser.parse!(input_with_bad_name) }.to raise_error Bioinform::Error }
55
+ end
56
+ context 'parser having name and header row' do
57
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_name: true, has_header_row: true) }
58
+ let(:input) {">PM name\n" + "A\tC\tG\tT\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
59
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
60
+ end
61
+ context 'parser having header row' do
62
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_name: false, has_header_row: true) }
63
+ let(:input) {"A\tC\tG\tT\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
64
+ specify { expect( parser.parse!(input) ).to eq( {name: nil, matrix: [[1,2,3,4],[11,12,13,14]]} ) }
65
+ specify { expect{ parser.parse!("Motif name\n" + input) }.to raise_error Bioinform::Error }
66
+ end
67
+ context 'parser having header column' do
68
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_header_column: true) }
69
+ let(:input) {">PM name\n" + "##01\t1\t2\t3\t4\n" + "##02\t11\t12\t13\t14" }
70
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
71
+ end
72
+ context 'parser having both headers' do
73
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :columns, has_header_row: true, has_header_column: true) }
74
+ let(:input) {">PM name\n" + "X\tA\tC\tG\tT\n" + "##01\t1\t2\t3\t4\n" + "##02\t11\t12\t13\t14" }
75
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
76
+ end
77
+
78
+ context 'parser for transposed matrix' do
79
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :rows) }
80
+ let(:input) {">PM name\n" + "1\t11\n" + "2\t12\n" + "3\t13\n" + "4\t14" }
81
+ specify { expect( parser.parse!(input) ).to eq( {name: 'PM name', matrix: [[1,2,3,4],[11,12,13,14]]} ) }
82
+ end
83
+ context 'parser for transposed matrix with row header' do
84
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :rows, has_header_row: true) }
85
+ let(:input) {">PM name\n" + "##01\t##02\n" + "1\t11\n" + "2\t12\n" + "3\t13\n" + "4\t14" }
86
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
87
+ end
88
+ context 'parser for transposed matrix with column header' do
89
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :rows, has_header_column: true) }
90
+ let(:input) {">PM name\n" + "A\t1\t11\n" + "C\t2\t12\n" + "G\t3\t13\n" + "T\t4\t14" }
91
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
92
+ end
93
+ context 'parser for transposed matrix with both header' do
94
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :rows, has_header_column: true, has_header_row: true) }
95
+ let(:input) {">PM name\n" + "X\t##01\t##02\n" + "A\t1\t11\n" + "C\t2\t12\n" + "G\t3\t13\n" + "T\t4\t14" }
96
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
97
+ end
98
+
99
+ context 'parser having custom name pattern' do
100
+ subject(:parser) { Bioinform::MatrixParser.new(has_name: true, name_pattern: /^NA>(?<name>.+)$/) }
101
+ let(:input_allowed) {"NA>Motif name\tother info\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
102
+ let(:input_not_allowed) {"Motif name\tother info\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
103
+ specify { expect( parser.parse!(input_allowed) ).to eq( {name: "Motif name\tother info", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
104
+ specify { expect{ parser.parse!(input_not_allowed) }.to raise_error Bioinform::Error }
105
+ end
106
+
107
+ context 'parser reducing number of nucleotides' do
108
+ subject(:parser) { Bioinform::MatrixParser.new(has_name: true) }
109
+ let(:input) {">PM name\n" + "1\t2\t3\t4\t5\n" + "11\t12\t13\t14\t15" }
110
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4],[11,12,13,14]]} ) }
111
+ end
112
+ context 'parser for transposed matrix reducing number of nucleotides' do
113
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :rows) }
114
+ let(:input) {">PM name\n" + "1\t11\n" + "2\t12\n" + "3\t13\n" + "4\t14\n" + "5\t15"}
115
+ specify { expect( parser.parse!(input) ).to eq( {name: 'PM name', matrix: [[1,2,3,4],[11,12,13,14]]} ) }
116
+ end
117
+ context 'parser not reducing number of nucleotides' do
118
+ subject(:parser) { Bioinform::MatrixParser.new(has_name: true, fix_nucleotides_number: false) }
119
+ let(:input) {">PM name\n" + "1\t2\t3\t4\t5\n" + "11\t12\t13\t14\t15" }
120
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3,4,5],[11,12,13,14,15]]} ) }
121
+ end
122
+ context 'parser reducing number of nucleotides to a non-standard one' do
123
+ subject(:parser) { Bioinform::MatrixParser.new(has_name: true, fix_nucleotides_number: 3) }
124
+ let(:input) {">PM name\n" + "1\t2\t3\t4\t5\n" + "11\t12\t13\t14\t15" }
125
+ specify { expect( parser.parse!(input) ).to eq( {name: "PM name", matrix: [[1,2,3],[11,12,13]]} ) }
126
+ end
127
+ context 'parser which hasn\'t enough number of nucleotides' do
128
+ subject(:parser) { Bioinform::MatrixParser.new(has_name: true, fix_nucleotides_number: 4) }
129
+ let(:input) {">PM name\n" + "1\t2\t3\n" + "11\t12\t13" }
130
+ specify { expect{ parser.parse!(input) }.to raise_error Bioinform::Error }
131
+ end
132
+
133
+ context 'parser with auto transposition' do
134
+ let(:input_not_transposed) {">PM Name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14" }
135
+ let(:input_transposed) {">PM Name\n" + "1\t11\n" + "2\t12\n" + "3\t13\n" + "4\t14"}
136
+ let(:input_4x4) {">PM Name\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14\n" + "1\t2\t3\t4\n" + "11\t12\t13\t14"}
137
+ context 'with fixed nucleotides number' do
138
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :auto, fix_nucleotides_number: 4) }
139
+ specify { expect(parser.parse(input_not_transposed)).to eq({name:'PM Name', matrix: [[1,2,3,4],[11,12,13,14]]}) }
140
+ specify { expect(parser.parse(input_transposed)).to eq({name:'PM Name', matrix: [[1,2,3,4],[11,12,13,14]]}) }
141
+ specify { expect(parser.parse(input_4x4)).to eq({name:'PM Name', matrix: [[1,2,3,4],[11,12,13,14],[1,2,3,4],[11,12,13,14]]}) }
142
+ end
143
+ context 'with non fixed nucleotides number' do
144
+ subject(:parser) { Bioinform::MatrixParser.new(nucleotides_in: :auto, fix_nucleotides_number: false) }
145
+ specify { expect(parser.parse(input_not_transposed)).to eq({name:'PM Name', matrix: [[1,2,3,4],[11,12,13,14]]}) }
146
+ specify { expect(parser.parse(input_transposed)).to eq({name:'PM Name', matrix: [[1,11],[2,12],[3,13],[4,14]]}) }
147
+ specify { expect(parser.parse(input_4x4)).to eq({name:'PM Name', matrix: [[1,2,3,4],[11,12,13,14],[1,2,3,4],[11,12,13,14]]}) }
148
+ end
149
+ end
150
+
151
+ context 'FANTOM-formatted motifs' do
152
+ let(:parser) do
153
+ Bioinform::MatrixParser.new( has_name: true, name_pattern: /^NA\s+(?<name>.+)$/,
154
+ has_header_row: true, has_header_column: true, nucleotides_in: :columns,
155
+ reduce_to_n_nucleotides: 4 )
156
+ end
157
+
158
+ specify 'parse strings in FANTOM format' do
159
+ input = "NA PM_name\n" +
160
+ "P0 A C G T\n" +
161
+ "P1 1 2 3 4\n" +
162
+ "P2 5 6 7 8"
163
+ expect(parser.parse(input)).to eq({matrix: [[1,2,3,4],[5,6,7,8]], name: 'PM_name'})
164
+ end
165
+
166
+
167
+ specify 'ignores additional columns' do
168
+ input = "NA PM_name\n" +
169
+ "P0 A C G T S P\n" +
170
+ "P1 1 2 3 4 5 10\n" +
171
+ "P2 5 6 7 8 5 11"
172
+ expect(parser.parse(input)).to eq({matrix: [[1,2,3,4],[5,6,7,8]], name: 'PM_name'})
173
+ end
174
+
175
+ specify 'parses string with more than 10 positions(2-digit row numbers)' do
176
+ input = "NA PM_name\n" +
177
+ "P0 A C G T\n" +
178
+ "P1 1 2 3 4\n" +
179
+ "P2 5 6 7 8\n" +
180
+ "P3 1 2 3 4\n" +
181
+ "P4 5 6 7 8\n" +
182
+ "P5 1 2 3 4\n" +
183
+ "P6 5 6 7 8\n" +
184
+ "P7 1 2 3 4\n" +
185
+ "P8 5 6 7 8\n" +
186
+ "P9 1 2 3 4\n" +
187
+ "P10 5 6 7 8\n" +
188
+ "P11 1 2 3 4\n" +
189
+ "P12 5 6 7 8"
190
+ expect(parser.parse(input)).to eq({matrix: [[1,2,3,4],[5,6,7,8]]*6, name: 'PM_name'})
191
+ end
192
+
193
+ good_cases = {
194
+ 'Nx4 string' => {input: "1 2 3 4\n5 6 7 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: nil} },
195
+ '4xN string' => {input: "1 5\n2 6\n3 7\n 4 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: nil} },
196
+ 'string with name' => {input: "PM_name\n1 5\n2 6\n3 7\n 4 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: 'PM_name'} },
197
+ 'string with name (with introduction sign)' => {input: ">\t PM_name\n1 5\n2 6\n3 7\n 4 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: 'PM_name'} },
198
+ 'string with name (with special characters)' => {input: "Testmatrix_first:subname+sub-subname\n1 5\n2 6\n3 7\n 4 8",
199
+ result: {matrix: [[1,2,3,4],[5,6,7,8]], name: 'Testmatrix_first:subname+sub-subname'} },
200
+ 'string with float numerics' => {input: "1.23 4.56 7.8 9.0\n9 -8.7 6.54 -3210", result: {matrix: [[1.23, 4.56, 7.8, 9.0],[9, -8.7, 6.54, -3210]], name: nil} },
201
+ 'string with exponents' => {input: "123e-2 0.456e+1 7.8 9.0\n9 -87000000000E-10 6.54 -3.210e3", result: {matrix: [[1.23, 4.56, 7.8, 9.0],[9, -8.7, 6.54, -3210]], name: nil} },
202
+ 'string with multiple spaces and tabs' => {input: "1 \t\t 2 3 4\n 5 6 7 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: nil} },
203
+ 'string with preceeding and terminating newlines' => {input: "\n\n\t 1 2 3 4\n5 6 7 8 \n\t\n", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: nil} },
204
+ 'string with windows crlf' => {input: "1 2 3 4\r\n5 6 7 8", result: {matrix: [[1,2,3,4],[5,6,7,8]], name: nil} },
205
+ }
206
+
207
+ bad_cases = {
208
+ 'string with non-numeric input' => {input: "1.23 4.56 78aaa 9.0\n9 -8.7 6.54 -3210" },
209
+ 'string with non-numeric input at the end of line' => {input: "1.23 4.56 78 9.0aaa\n9 -8.7 6.54 -3210" },
210
+ 'string with non-numeric input at a separate line' => {input: "1.23 4.56 78 9.0\naaa\n9 -8.7 6.54 -3210" },
211
+ 'string with empty exponent sign' => {input: "1.23 4.56 7.8 9.0\n 9e -8.7 6.54 3210" }
212
+ }
213
+
214
+ parser_specs(Bioinform::MatrixParser.new, good_cases, bad_cases)
215
+ end
216
+ end
@@ -0,0 +1,87 @@
1
+ require 'bioinform/parsers/motif_splitter'
2
+
3
+ describe Bioinform::MotifSplitter do
4
+ let(:motif_unnamed) { "1 2 3 4\n"+"5\t6\t7\t8" }
5
+ let(:motif_with_floats) { "motif2\n 1.0 1.1 1.2 1.3\n 14 15 16 17\n 19 20 21 22" }
6
+ let(:motif_with_signs_and_exponents) { "> motif3\n-2.0 1.3e-3 -5.47 5.2\n+3.4 7 3 3" }
7
+
8
+ context 'default splitter\n' do
9
+ let(:motif_splitter) { Bioinform::MotifSplitter.new }
10
+
11
+ specify do
12
+ input = motif_unnamed + "\n" + motif_with_floats + "\n" + motif_with_signs_and_exponents
13
+ expect(motif_splitter.split(input)).to eq [motif_unnamed, motif_with_floats, motif_with_signs_and_exponents]
14
+ end
15
+
16
+ specify do
17
+ input = motif_unnamed + "\n" + motif_with_floats + "\n" + motif_with_signs_and_exponents + "\n"
18
+ expect(motif_splitter.split(input)).to eq [motif_unnamed, motif_with_floats, motif_with_signs_and_exponents]
19
+ end
20
+
21
+ specify do
22
+ input = "Motif1 name\n" + motif_unnamed + "\n" + motif_with_floats + "\n" + motif_with_signs_and_exponents
23
+ expect(motif_splitter.split(input)).to eq ["Motif1 name\n" + motif_unnamed, motif_with_floats, motif_with_signs_and_exponents]
24
+ end
25
+
26
+ specify { expect(motif_splitter.split(motif_unnamed + "\n\n" + motif_unnamed)).to eq [motif_unnamed, motif_unnamed] }
27
+
28
+ specify { expect(motif_splitter.split(motif_unnamed + "\n\n\n" + motif_unnamed)).to eq [motif_unnamed, motif_unnamed] }
29
+ end
30
+
31
+ context 'with specified pattern' do
32
+ let(:motif_splitter) { Bioinform::MotifSplitter.new(start_motif_pattern: /^NA\s+\w+$/, splitter_pattern: /^\/\/\s$/) }
33
+
34
+ let(:input_1) {
35
+ "NA motif_1\n" +
36
+ "P0 A C G T\n" +
37
+ "P1 0 1 2 3\n" +
38
+ "P2 4 5 6 7"
39
+ }
40
+
41
+ let(:input_2) {
42
+ "NA motif_2\n" +
43
+ "P0 A C G T\n" +
44
+ "P1 1 2 3 4\n" +
45
+ "P2 5 6 7 8\n" +
46
+ "P3 9 10 11 12"
47
+ }
48
+
49
+ let(:input_3) {
50
+ "NA motif_3\n" +
51
+ "P0 A C G T\n" +
52
+ "P1 2 3 4 5\n" +
53
+ "P2 6 7 8 9"
54
+ }
55
+
56
+ # this input has no line matching the start-of-motif pattern
57
+ let(:input_wo_name) {
58
+ "P0 A C G T\n" +
59
+ "P1 3 4 5 6\n" +
60
+ "P2 7 8 9 10"
61
+ }
62
+
63
+ specify do
64
+ input = "//\n" +
65
+ input_1 + "\n" +
66
+ "//\n" +
67
+ "//\n" +
68
+ input_2 + "\n" +
69
+ "//\n" +
70
+ input_3
71
+ expect(motif_splitter.split(input)).to eq [input_1, input_2, input_3]
72
+ end
73
+
74
+ specify 'splitter (w/o motif starter) is enough to split motifs' do
75
+ input = input_1 + "\n" +
76
+ "//\n" +
77
+ input_wo_name
78
+ expect(motif_splitter.split(input)).to eq [input_1, input_wo_name]
79
+ end
80
+
81
+ specify 'motif starter (w/o splitter) is enough to split motifs' do
82
+ input = input_1 + "\n" +
83
+ input_2
84
+ expect(motif_splitter.split(input)).to eq [input_1, input_2]
85
+ end
86
+ end
87
+ end