bioinform 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/TODO.txt +10 -5
- data/bioinform.gemspec +1 -1
- data/lib/bioinform/cli/pcm2pwm.rb +6 -6
- data/lib/bioinform/cli/split_motifs.rb +7 -7
- data/lib/bioinform/data_models.rb +2 -0
- data/lib/bioinform/data_models/collection.rb +85 -1
- data/lib/bioinform/data_models/pcm.rb +8 -5
- data/lib/bioinform/data_models/pm.rb +54 -39
- data/lib/bioinform/data_models/pwm.rb +3 -3
- data/lib/bioinform/parsers/parser.rb +11 -11
- data/lib/bioinform/parsers/string_fantom_parser.rb +23 -2
- data/lib/bioinform/parsers/string_parser.rb +16 -16
- data/lib/bioinform/support/collect_hash.rb +1 -1
- data/lib/bioinform/version.rb +1 -1
- data/spec/cli/data/KLF4 f2 spaced name.pcm +11 -0
- data/spec/cli/data/KLF4_f2.pcm +11 -0
- data/spec/cli/data/KLF4_f2.pwm.result +11 -0
- data/spec/cli/data/SP1_f1.pcm +12 -0
- data/spec/cli/data/SP1_f1.pwm.result +12 -0
- data/spec/cli/pcm2pwm_spec.rb +74 -0
- data/spec/data_models/collection_spec.rb +96 -0
- data/spec/data_models/pcm_spec.rb +5 -5
- data/spec/data_models/pm_spec.rb +136 -30
- data/spec/data_models/ppm_spec.rb +1 -1
- data/spec/data_models/pwm_spec.rb +2 -2
- data/spec/parsers/parser_spec.rb +26 -26
- data/spec/parsers/string_fantom_parser_spec.rb +52 -15
- data/spec/parsers/string_parser_spec.rb +34 -34
- data/spec/spec_helper.rb +32 -1
- data/spec/support/delete_many_spec.rb +2 -2
- metadata +14 -2
data/TODO.txt
CHANGED
@@ -1,4 +1,8 @@
|
|
1
|
-
|
1
|
+
ToDo:
|
2
|
+
Figure out how to define PM#equal? and PM#hash so that using PMs in Sets doesn't break Set comparison: two sets holding the same PMs (but as different objects) should be equal. (The same applies to using a PM as a hash key.)
|
3
|
+
|
4
|
+
Make parser exceptions print the text where parsing failed (the line being processed ± the 2 nearest lines, plus the command and line numbers)
|
5
|
+
Prevent the parser from going into an infinite loop
|
2
6
|
|
3
7
|
Create CLI-apps:
|
4
8
|
-- to merge many files(or whole folder) to a Collection
|
@@ -10,9 +14,10 @@ Decide:
|
|
10
14
|
PPM format such that the parser gets both the matrix and the count (if the PPM has `word_count`)
|
11
15
|
-- can_parse?
|
12
16
|
-- Whether to cache suffixes: cache :best_suffix, obsolete: [:discrete!, :background!, ...]
|
17
|
+
-- behaviour of PM#== for PMs with different tags
|
18
|
+
-- should background be in PM by default?
|
19
|
+
-- refactor PM.new, #== and so on to make it possible to consistently introduce or remove a variable in a single line
|
20
|
+
-- Make PCM#valid? and PPM#valid? more specific. This shouldn't remove the ability to load arbitrary data as a matrix, but that should be possible only in force mode (I don't understand yet where it should live: in a constructor or elsewhere? And which validation-"severity" levels should there be? Strong validation - size-only-validation - size-and-type-validation - no validation ??? Or maybe options: valid_strictness: 'strict', 'usual', 'strict_with_name' ??? This needs to be considered)
|
13
21
|
|
14
22
|
Specs
|
15
|
-
--
|
16
|
-
-- PWM#probabilities, #score_variance, #gauss_estimation
|
17
|
-
-- pcm2pwm
|
18
|
-
-- split_motifs
|
23
|
+
-- PWM#probabilities, #score_variance, #gauss_estimation
|
data/bioinform.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
|
|
8
8
|
gem.summary = %q{Classes for work with different input formats of positional matrices and IUPAC-words and making simple transform and statistics with them. Also module includes several useful extensions for Enumerable module like parametric map and callable symbols }
|
9
9
|
gem.homepage = ""
|
10
10
|
|
11
|
-
gem.files = `git ls-files`.split(
|
11
|
+
gem.files = `git ls-files`.split($/)
|
12
12
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
13
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
14
|
gem.name = "bioinform"
|
@@ -13,7 +13,7 @@ When filelist is empty, it's obtained from STDIN. One can use it: `ls -b pcm_fol
|
|
13
13
|
|
14
14
|
Usage:
|
15
15
|
#{__FILE__} [options] [<pcm-files>...]
|
16
|
-
|
16
|
+
|
17
17
|
Options:
|
18
18
|
-h --help Show this screen.
|
19
19
|
-e --extension EXT Extension of output files [default: pwm]
|
@@ -21,27 +21,27 @@ Options:
|
|
21
21
|
DOCOPT
|
22
22
|
|
23
23
|
options = Docopt::docopt(doc, argv: argv)
|
24
|
-
|
24
|
+
|
25
25
|
if options['<pcm-files>'].empty?
|
26
26
|
filelist = $stdin.read.shellsplit
|
27
27
|
else
|
28
28
|
filelist = options['<pcm-files>']
|
29
29
|
end
|
30
|
-
|
30
|
+
|
31
31
|
folder = options['--folder']
|
32
32
|
Dir.mkdir(folder) unless Dir.exist?(folder)
|
33
|
-
|
33
|
+
|
34
34
|
filelist.each do |pcm_filename|
|
35
35
|
pwm = Bioinform::PCM.new( File.read(pcm_filename) ).to_pwm
|
36
36
|
File.open(Bioinform::CLI.output_filename(pcm_filename, options['--extension'], folder), 'w') do |f|
|
37
37
|
f.puts pwm
|
38
38
|
end
|
39
39
|
end
|
40
|
-
|
40
|
+
|
41
41
|
rescue Docopt::Exit => e
|
42
42
|
puts e.message
|
43
43
|
end
|
44
|
-
|
44
|
+
|
45
45
|
end
|
46
46
|
end
|
47
47
|
end
|
@@ -4,7 +4,7 @@ require 'docopt'
|
|
4
4
|
module Bioinform
|
5
5
|
module CLI
|
6
6
|
module SplitMotifs
|
7
|
-
|
7
|
+
|
8
8
|
def self.main(argv)
|
9
9
|
doc = <<-DOCOPT
|
10
10
|
Motif splitter.
|
@@ -12,7 +12,7 @@ It get a file with a set of motifs and splits it into motifs according to their
|
|
12
12
|
|
13
13
|
Usage:
|
14
14
|
#{__FILE__} [options] <collection-file>
|
15
|
-
|
15
|
+
|
16
16
|
Options:
|
17
17
|
-h --help Show this screen.
|
18
18
|
-m --data-model MODEL Data model: PM, PCM, PPM or PWM [default: PM]
|
@@ -21,17 +21,17 @@ Options:
|
|
21
21
|
DOCOPT
|
22
22
|
|
23
23
|
options = Docopt::docopt(doc, argv: argv)
|
24
|
-
|
24
|
+
|
25
25
|
folder = options['--folder']
|
26
26
|
Dir.mkdir(folder) unless Dir.exist?(folder)
|
27
|
-
|
27
|
+
|
28
28
|
data_model = Bioinform.const_get(options['--data-model'].upcase)
|
29
29
|
extension = options['--extension'] || options['--data-model'].downcase
|
30
|
-
|
30
|
+
|
31
31
|
collection_filename = options['<collection-file>']
|
32
32
|
raise "File #{collection_filename} not exist" unless File.exist? collection_filename
|
33
33
|
input = File.read(collection_filename)
|
34
|
-
|
34
|
+
|
35
35
|
data_model.choose_parser(input).split_on_motifs(input, data_model).each do |motif|
|
36
36
|
File.open(File.join(folder, "#{motif.name}.#{extension}"), 'w') do |f|
|
37
37
|
f.puts motif
|
@@ -40,7 +40,7 @@ Options:
|
|
40
40
|
rescue Docopt::Exit => e
|
41
41
|
puts e.message
|
42
42
|
end
|
43
|
-
|
43
|
+
|
44
44
|
end
|
45
45
|
end
|
46
46
|
end
|
@@ -1,2 +1,86 @@
|
|
1
|
-
|
1
|
+
require 'ostruct'
|
2
|
+
|
3
|
+
module Bioinform
|
4
|
+
class Collection
|
5
|
+
attr_reader :collection, :parameters
|
6
|
+
|
7
|
+
# The collection serves as a tag on each motif it contains. A motif can be included in several collections, and so can have several tags.
|
8
|
+
def initialize(parameters = {})
|
9
|
+
@collection = []
|
10
|
+
@parameters = OpenStruct.new(parameters)
|
11
|
+
yield @parameters if block_given?
|
12
|
+
end
|
13
|
+
|
14
|
+
def size
|
15
|
+
collection.size
|
16
|
+
end
|
17
|
+
|
18
|
+
def name
|
19
|
+
parameters.name
|
20
|
+
end
|
21
|
+
|
22
|
+
def to_s
|
23
|
+
"<Collection '#{name}'>"
|
24
|
+
end
|
25
|
+
|
26
|
+
def +(other)
|
27
|
+
result = self.class.new
|
28
|
+
each do |pm, infos|
|
29
|
+
result.add_pm(pm, infos)
|
30
|
+
end
|
31
|
+
other.each do |pm, infos|
|
32
|
+
result.add_pm(pm, infos)
|
33
|
+
end
|
34
|
+
result
|
35
|
+
end
|
36
|
+
|
37
|
+
def add_pm(pm, info)
|
38
|
+
pm.mark(self)
|
39
|
+
collection << [pm, info]
|
40
|
+
self
|
41
|
+
end
|
42
|
+
|
43
|
+
def <<(pm)
|
44
|
+
add_pm(pm, OpenStruct.new)
|
45
|
+
end
|
46
|
+
|
47
|
+
def each
|
48
|
+
if block_given?
|
49
|
+
collection.each{|pm, infos| yield [pm, infos]}
|
50
|
+
else
|
51
|
+
Enumerator.new(self, :each)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def each_pm
|
56
|
+
if block_given?
|
57
|
+
each{|pm, infos| yield pm}
|
58
|
+
else
|
59
|
+
Enumerator.new(self, :each_pm)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
include Enumerable
|
64
|
+
|
65
|
+
%w[pcm ppm pwm].each do |data_model|
|
66
|
+
method_name = "each_#{data_model}".to_sym #
|
67
|
+
converter_method = "to_#{data_model}".to_sym #
|
68
|
+
define_method method_name do |&block| # define_method :each_pcm do |&block|
|
69
|
+
if block # if block
|
70
|
+
each do |pm, infos| # each do |pm, infos|
|
71
|
+
block.call pm.send(converter_method) # block.call pm.send(:to_pcm)
|
72
|
+
end # end
|
73
|
+
else # else
|
74
|
+
Enumerator.new(self, method_name) # Enumerator.new(self, :each_pcm)
|
75
|
+
end # end
|
76
|
+
end # end
|
77
|
+
end
|
78
|
+
|
79
|
+
def ==(other)
|
80
|
+
(collection == other.collection) && (parameters == other.parameters)
|
81
|
+
rescue
|
82
|
+
false
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
2
86
|
end
|
@@ -6,20 +6,23 @@ module Bioinform
|
|
6
6
|
def count
|
7
7
|
matrix.first.inject(&:+)
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
|
+
def to_pcm
|
11
|
+
self
|
12
|
+
end
|
13
|
+
|
10
14
|
def to_pwm(pseudocount = Math.log(count))
|
11
15
|
mat = each_position.map do |pos|
|
12
16
|
pos.each_index.map do |ind|
|
13
17
|
Math.log((pos[ind] + probability[ind] * pseudocount) / (probability[ind]*(count + pseudocount)) )
|
14
18
|
end
|
15
19
|
end
|
16
|
-
PWM.new(matrix: mat, name: name)
|
20
|
+
PWM.new(matrix: mat, name: name, tags: tags, background: background)
|
17
21
|
end
|
18
|
-
|
22
|
+
|
19
23
|
def to_ppm
|
20
24
|
mat = each_position.map{|pos| pos.map{|el| el.to_f / count }}
|
21
|
-
PPM.new(matrix: mat, name: name)
|
25
|
+
PPM.new(matrix: mat, name: name, tags: tags, background: background)
|
22
26
|
end
|
23
|
-
|
24
27
|
end
|
25
28
|
end
|
@@ -4,45 +4,56 @@ require 'bioinform/parsers'
|
|
4
4
|
module Bioinform
|
5
5
|
IndexByLetter = {'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3, A: 0, C: 1, G: 2, T: 3}
|
6
6
|
LetterByIndex = {0 => :A, 1 => :C, 2 => :G, 3 => :T}
|
7
|
-
|
7
|
+
|
8
8
|
class PM
|
9
|
-
attr_reader :matrix
|
9
|
+
attr_reader :matrix, :tags
|
10
10
|
attr_accessor :background, :name
|
11
|
-
|
11
|
+
|
12
|
+
def mark(tag)
|
13
|
+
tags << tag
|
14
|
+
end
|
15
|
+
|
16
|
+
def tagged?(tag)
|
17
|
+
tags.any?{|t| (t.eql? tag) || (t.respond_to?(:name) && t.name && (t.name == tag)) }
|
18
|
+
end
|
19
|
+
|
12
20
|
def self.choose_parser(input)
|
13
21
|
[TrivialParser, Parser, StringParser, StringFantomParser].find do |parser|
|
14
22
|
self.new(input, parser) rescue nil
|
15
23
|
end
|
16
24
|
end
|
17
|
-
|
25
|
+
|
18
26
|
def initialize(input, parser = nil)
|
19
27
|
parser ||= self.class.choose_parser(input)
|
20
28
|
raise 'No one parser can process input' unless parser
|
21
29
|
result = parser.new(input).parse
|
22
30
|
@matrix = result[:matrix]
|
23
31
|
@name = result[:name]
|
24
|
-
@
|
32
|
+
@tags = result[:tags] || []
|
33
|
+
@background = result[:background] || [1, 1, 1, 1]
|
25
34
|
raise 'matrix not valid' unless valid?
|
26
35
|
end
|
27
|
-
|
36
|
+
|
28
37
|
def ==(other)
|
29
|
-
@matrix == other.matrix && @background == other.background
|
38
|
+
@matrix == other.matrix && @background == other.background && @name == other.name
|
39
|
+
rescue
|
40
|
+
false
|
30
41
|
end
|
31
|
-
|
42
|
+
|
32
43
|
def self.valid_matrix?(matrix)
|
33
44
|
matrix.is_a?(Array) &&
|
34
45
|
! matrix.empty? &&
|
35
46
|
matrix.all?{|pos| pos.is_a?(Array)} &&
|
36
47
|
matrix.all?{|pos| pos.size == 4} &&
|
37
48
|
matrix.all?{|pos| pos.all?{|el| el.is_a?(Numeric)}}
|
38
|
-
rescue
|
49
|
+
rescue
|
39
50
|
false
|
40
51
|
end
|
41
|
-
|
52
|
+
|
42
53
|
def valid?
|
43
54
|
self.class.valid_matrix?(@matrix)
|
44
55
|
end
|
45
|
-
|
56
|
+
|
46
57
|
def each_position
|
47
58
|
if block_given?
|
48
59
|
matrix.each{|pos| yield pos}
|
@@ -50,12 +61,12 @@ module Bioinform
|
|
50
61
|
Enumerator.new(self, :each_position)
|
51
62
|
end
|
52
63
|
end
|
53
|
-
|
64
|
+
|
54
65
|
def length
|
55
66
|
@matrix.length
|
56
67
|
end
|
57
68
|
alias_method :size, :length
|
58
|
-
|
69
|
+
|
59
70
|
def to_s(options = {})
|
60
71
|
default_options = {with_name: true, letters_as_rows: false}
|
61
72
|
options = default_options.merge(options)
|
@@ -65,41 +76,41 @@ module Bioinform
|
|
65
76
|
else
|
66
77
|
matrix_str = each_position.map{|pos| pos.join("\t")}.join("\n")
|
67
78
|
end
|
68
|
-
|
69
|
-
if options[:with_name] && @name
|
79
|
+
|
80
|
+
if options[:with_name] && @name
|
70
81
|
@name + "\n" + matrix_str
|
71
|
-
else
|
82
|
+
else
|
72
83
|
matrix_str
|
73
84
|
end
|
74
85
|
end
|
75
|
-
|
86
|
+
|
76
87
|
def pretty_string(options = {})
|
77
88
|
default_options = {with_name: true, letters_as_rows: false}
|
78
|
-
|
89
|
+
|
79
90
|
return to_s(options) if options[:letters_as_rows]
|
80
|
-
|
91
|
+
|
81
92
|
options = default_options.merge(options)
|
82
93
|
header = %w{A C G T}.map{|el| el.rjust(4).ljust(7)}.join + "\n"
|
83
94
|
matrix_rows = each_position.map do |position|
|
84
95
|
position.map{|el| el.round(3).to_s.rjust(6)}.join(' ')
|
85
96
|
end
|
86
|
-
|
97
|
+
|
87
98
|
matrix_str = matrix_rows.join("\n")
|
88
|
-
|
99
|
+
|
89
100
|
if options[:with_name] && @name
|
90
101
|
@name + "\n" + header + matrix_str
|
91
102
|
else
|
92
103
|
header + matrix_str
|
93
104
|
end
|
94
105
|
end
|
95
|
-
|
106
|
+
|
96
107
|
def to_hash
|
97
|
-
hsh = %w{A C G T}.each_with_index.collect_hash do |letter, letter_index|
|
108
|
+
hsh = %w{A C G T}.each_with_index.collect_hash do |letter, letter_index|
|
98
109
|
[ letter, @matrix.map{|pos| pos[letter_index]} ]
|
99
110
|
end
|
100
111
|
hsh.with_indifferent_access
|
101
112
|
end
|
102
|
-
|
113
|
+
|
103
114
|
# pm.background - returns a @background attribute
|
104
115
|
# pm.background(new_background) - sets an attribute and returns pm itself
|
105
116
|
# if more than one argument passed - raises an exception
|
@@ -110,17 +121,17 @@ module Bioinform
|
|
110
121
|
else raise ArgumentError, '#background method can get 0 or 1 argument'
|
111
122
|
end
|
112
123
|
end
|
113
|
-
|
124
|
+
|
114
125
|
def background!(new_background)
|
115
126
|
@background = new_background
|
116
127
|
self
|
117
128
|
end
|
118
|
-
|
129
|
+
|
119
130
|
def self.zero_column
|
120
131
|
[0, 0, 0, 0]
|
121
132
|
end
|
122
133
|
|
123
|
-
def reverse_complement!
|
134
|
+
def reverse_complement!
|
124
135
|
@matrix.reverse!.map!(&:reverse!)
|
125
136
|
self
|
126
137
|
end
|
@@ -132,10 +143,7 @@ module Bioinform
|
|
132
143
|
n.times{ @matrix.push(self.class.zero_column) }
|
133
144
|
self
|
134
145
|
end
|
135
|
-
|
136
|
-
@matrix.map!{|position| min = position.min; position.map{|element| element - min}}
|
137
|
-
self
|
138
|
-
end
|
146
|
+
|
139
147
|
def discrete!(rate)
|
140
148
|
@matrix.map!{|position| position.map{|element| (element * rate).ceil}}
|
141
149
|
self
|
@@ -144,13 +152,12 @@ module Bioinform
|
|
144
152
|
def vocabulary_volume
|
145
153
|
background.inject(&:+) ** length
|
146
154
|
end
|
147
|
-
|
155
|
+
|
148
156
|
def probability
|
149
157
|
sum = background.inject(0.0, &:+)
|
150
158
|
background.map{|element| element.to_f / sum}
|
151
159
|
end
|
152
160
|
|
153
|
-
|
154
161
|
#def split(first_chunk_length)
|
155
162
|
# [@matrix.first(first_chunk_length), matrix.last(length - first_chunk_length)]
|
156
163
|
#end
|
@@ -164,16 +171,16 @@ module Bioinform
|
|
164
171
|
def worst_score
|
165
172
|
@matrix.inject(0.0){|sum, col| sum + col.min}
|
166
173
|
end
|
167
|
-
|
174
|
+
|
168
175
|
# best score of suffix s[i..l]
|
169
176
|
def best_suffix(i)
|
170
177
|
@matrix[i...length].map(&:max).inject(0.0, &:+)
|
171
178
|
end
|
172
|
-
|
179
|
+
|
173
180
|
def worst_suffix(i)
|
174
181
|
@matrix[i...length].map(&:min).inject(0.0, &:+)
|
175
182
|
end
|
176
|
-
|
183
|
+
|
177
184
|
def reverse_complement
|
178
185
|
dup.reverse_complement!
|
179
186
|
end
|
@@ -183,14 +190,22 @@ module Bioinform
|
|
183
190
|
def right_augment(n)
|
184
191
|
dup.right_augment!(n)
|
185
192
|
end
|
186
|
-
def shift_to_zero
|
187
|
-
dup.shift_to_zero!
|
188
|
-
end
|
189
193
|
def discrete(rate)
|
190
194
|
dup.discrete!(rate)
|
191
195
|
end
|
192
196
|
def dup
|
193
197
|
deep_dup
|
194
198
|
end
|
199
|
+
|
200
|
+
def to_pcm
|
201
|
+
PCM.new(matrix: matrix, name: name, tags: tags, background: background)
|
202
|
+
end
|
203
|
+
def to_ppm
|
204
|
+
PPM.new(matrix: matrix, name: name, tags: tags, background: background)
|
205
|
+
end
|
206
|
+
def to_pwm
|
207
|
+
PWM.new(matrix: matrix, name: name, tags: tags, background: background)
|
208
|
+
end
|
209
|
+
|
195
210
|
end
|
196
211
|
end
|