bioinform 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/TODO.txt +10 -5
- data/bioinform.gemspec +1 -1
- data/lib/bioinform/cli/pcm2pwm.rb +6 -6
- data/lib/bioinform/cli/split_motifs.rb +7 -7
- data/lib/bioinform/data_models.rb +2 -0
- data/lib/bioinform/data_models/collection.rb +85 -1
- data/lib/bioinform/data_models/pcm.rb +8 -5
- data/lib/bioinform/data_models/pm.rb +54 -39
- data/lib/bioinform/data_models/pwm.rb +3 -3
- data/lib/bioinform/parsers/parser.rb +11 -11
- data/lib/bioinform/parsers/string_fantom_parser.rb +23 -2
- data/lib/bioinform/parsers/string_parser.rb +16 -16
- data/lib/bioinform/support/collect_hash.rb +1 -1
- data/lib/bioinform/version.rb +1 -1
- data/spec/cli/data/KLF4 f2 spaced name.pcm +11 -0
- data/spec/cli/data/KLF4_f2.pcm +11 -0
- data/spec/cli/data/KLF4_f2.pwm.result +11 -0
- data/spec/cli/data/SP1_f1.pcm +12 -0
- data/spec/cli/data/SP1_f1.pwm.result +12 -0
- data/spec/cli/pcm2pwm_spec.rb +74 -0
- data/spec/data_models/collection_spec.rb +96 -0
- data/spec/data_models/pcm_spec.rb +5 -5
- data/spec/data_models/pm_spec.rb +136 -30
- data/spec/data_models/ppm_spec.rb +1 -1
- data/spec/data_models/pwm_spec.rb +2 -2
- data/spec/parsers/parser_spec.rb +26 -26
- data/spec/parsers/string_fantom_parser_spec.rb +52 -15
- data/spec/parsers/string_parser_spec.rb +34 -34
- data/spec/spec_helper.rb +32 -1
- data/spec/support/delete_many_spec.rb +2 -2
- metadata +14 -2
data/TODO.txt
CHANGED
@@ -1,4 +1,8 @@
|
|
1
|
-
|
1
|
+
ToDo:
|
2
|
+
How to define PM#equal? and PM#hash so that using PMs in Sets doesn't break comparability of Sets, i.e. two sets containing the same PMs (but different objects) are equal. (The same applies to using a PM as a hash key.)
|
3
|
+
|
4
|
+
Make parser exceptions print the text where parsing failed (the line being processed +- 2 nearest lines, plus the command and line numbers)
|
5
|
+
Prevent the parser from going into an infinite loop
|
2
6
|
|
3
7
|
Create CLI-apps:
|
4
8
|
-- to merge many files(or whole folder) to a Collection
|
@@ -10,9 +14,10 @@ Decide:
|
|
10
14
|
PPM format such that the parser gets both the matrix and the count (if the PPM has `word_count`)
|
11
15
|
-- can_parse?
|
12
16
|
-- Whether to cache suffixes: cache :best_suffix, obsolete: [:discrete!, :background!, ...]
|
17
|
+
-- behaviour of PM#== for PMs with different tags
|
18
|
+
-- should background be in PM by default?
|
19
|
+
-- refactor PM.new, PM#== and so on so that a variable can be consistently introduced or removed by changing a single line
|
20
|
+
-- Make PCM#valid? and PPM#valid? more specific. This shouldn't remove the ability to load arbitrary data as a matrix, but that should be possible only in force mode. (Open questions: where should this live - in the constructor or elsewhere? Which validation "severity" levels should exist: strong validation, size-only validation, size-and-type validation, no validation? Or perhaps an option such as valid_strictness: 'strict', 'usual', 'strict_with_name'? This needs to be thought through.)
|
13
21
|
|
14
22
|
Specs
|
15
|
-
--
|
16
|
-
-- PWM#probabilities, #score_variance, #gauss_estimation
|
17
|
-
-- pcm2pwm
|
18
|
-
-- split_motifs
|
23
|
+
-- PWM#probabilities, #score_variance, #gauss_estimation
|
data/bioinform.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
|
|
8
8
|
gem.summary = %q{Classes for work with different input formats of positional matrices and IUPAC-words and making simple transform and statistics with them. Also module includes several useful extensions for Enumerable module like parametric map and callable symbols }
|
9
9
|
gem.homepage = ""
|
10
10
|
|
11
|
-
gem.files = `git ls-files`.split(
|
11
|
+
gem.files = `git ls-files`.split($/)
|
12
12
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
13
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
14
|
gem.name = "bioinform"
|
@@ -13,7 +13,7 @@ When filelist is empty, it's obtained from STDIN. One can use it: `ls -b pcm_fol
|
|
13
13
|
|
14
14
|
Usage:
|
15
15
|
#{__FILE__} [options] [<pcm-files>...]
|
16
|
-
|
16
|
+
|
17
17
|
Options:
|
18
18
|
-h --help Show this screen.
|
19
19
|
-e --extension EXT Extension of output files [default: pwm]
|
@@ -21,27 +21,27 @@ Options:
|
|
21
21
|
DOCOPT
|
22
22
|
|
23
23
|
options = Docopt::docopt(doc, argv: argv)
|
24
|
-
|
24
|
+
|
25
25
|
if options['<pcm-files>'].empty?
|
26
26
|
filelist = $stdin.read.shellsplit
|
27
27
|
else
|
28
28
|
filelist = options['<pcm-files>']
|
29
29
|
end
|
30
|
-
|
30
|
+
|
31
31
|
folder = options['--folder']
|
32
32
|
Dir.mkdir(folder) unless Dir.exist?(folder)
|
33
|
-
|
33
|
+
|
34
34
|
filelist.each do |pcm_filename|
|
35
35
|
pwm = Bioinform::PCM.new( File.read(pcm_filename) ).to_pwm
|
36
36
|
File.open(Bioinform::CLI.output_filename(pcm_filename, options['--extension'], folder), 'w') do |f|
|
37
37
|
f.puts pwm
|
38
38
|
end
|
39
39
|
end
|
40
|
-
|
40
|
+
|
41
41
|
rescue Docopt::Exit => e
|
42
42
|
puts e.message
|
43
43
|
end
|
44
|
-
|
44
|
+
|
45
45
|
end
|
46
46
|
end
|
47
47
|
end
|
@@ -4,7 +4,7 @@ require 'docopt'
|
|
4
4
|
module Bioinform
|
5
5
|
module CLI
|
6
6
|
module SplitMotifs
|
7
|
-
|
7
|
+
|
8
8
|
def self.main(argv)
|
9
9
|
doc = <<-DOCOPT
|
10
10
|
Motif splitter.
|
@@ -12,7 +12,7 @@ It get a file with a set of motifs and splits it into motifs according to their
|
|
12
12
|
|
13
13
|
Usage:
|
14
14
|
#{__FILE__} [options] <collection-file>
|
15
|
-
|
15
|
+
|
16
16
|
Options:
|
17
17
|
-h --help Show this screen.
|
18
18
|
-m --data-model MODEL Data model: PM, PCM, PPM or PWM [default: PM]
|
@@ -21,17 +21,17 @@ Options:
|
|
21
21
|
DOCOPT
|
22
22
|
|
23
23
|
options = Docopt::docopt(doc, argv: argv)
|
24
|
-
|
24
|
+
|
25
25
|
folder = options['--folder']
|
26
26
|
Dir.mkdir(folder) unless Dir.exist?(folder)
|
27
|
-
|
27
|
+
|
28
28
|
data_model = Bioinform.const_get(options['--data-model'].upcase)
|
29
29
|
extension = options['--extension'] || options['--data-model'].downcase
|
30
|
-
|
30
|
+
|
31
31
|
collection_filename = options['<collection-file>']
|
32
32
|
raise "File #{collection_filename} not exist" unless File.exist? collection_filename
|
33
33
|
input = File.read(collection_filename)
|
34
|
-
|
34
|
+
|
35
35
|
data_model.choose_parser(input).split_on_motifs(input, data_model).each do |motif|
|
36
36
|
File.open(File.join(folder, "#{motif.name}.#{extension}"), 'w') do |f|
|
37
37
|
f.puts motif
|
@@ -40,7 +40,7 @@ Options:
|
|
40
40
|
rescue Docopt::Exit => e
|
41
41
|
puts e.message
|
42
42
|
end
|
43
|
-
|
43
|
+
|
44
44
|
end
|
45
45
|
end
|
46
46
|
end
|
@@ -1,2 +1,86 @@
|
|
1
|
-
|
1
|
+
require 'ostruct'
|
2
|
+
|
3
|
+
module Bioinform
|
4
|
+
class Collection
|
5
|
+
attr_reader :collection, :parameters
|
6
|
+
|
7
|
+
# The collection name serves as a tag for each motif in the collection. A motif can be included in several collections and so can have several tags
|
8
|
+
def initialize(parameters = {})
|
9
|
+
@collection = []
|
10
|
+
@parameters = OpenStruct.new(parameters)
|
11
|
+
yield @parameters if block_given?
|
12
|
+
end
|
13
|
+
|
14
|
+
def size
|
15
|
+
collection.size
|
16
|
+
end
|
17
|
+
|
18
|
+
def name
|
19
|
+
parameters.name
|
20
|
+
end
|
21
|
+
|
22
|
+
def to_s
|
23
|
+
"<Collection '#{name}'>"
|
24
|
+
end
|
25
|
+
|
26
|
+
def +(other)
|
27
|
+
result = self.class.new
|
28
|
+
each do |pm, infos|
|
29
|
+
result.add_pm(pm, infos)
|
30
|
+
end
|
31
|
+
other.each do |pm, infos|
|
32
|
+
result.add_pm(pm, infos)
|
33
|
+
end
|
34
|
+
result
|
35
|
+
end
|
36
|
+
|
37
|
+
def add_pm(pm, info)
|
38
|
+
pm.mark(self)
|
39
|
+
collection << [pm, info]
|
40
|
+
self
|
41
|
+
end
|
42
|
+
|
43
|
+
def <<(pm)
|
44
|
+
add_pm(pm, OpenStruct.new)
|
45
|
+
end
|
46
|
+
|
47
|
+
def each
|
48
|
+
if block_given?
|
49
|
+
collection.each{|pm, infos| yield [pm, infos]}
|
50
|
+
else
|
51
|
+
Enumerator.new(self, :each)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def each_pm
|
56
|
+
if block_given?
|
57
|
+
each{|pm, infos| yield pm}
|
58
|
+
else
|
59
|
+
Enumerator.new(self, :each_pm)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
include Enumerable
|
64
|
+
|
65
|
+
%w[pcm ppm pwm].each do |data_model|
|
66
|
+
method_name = "each_#{data_model}".to_sym #
|
67
|
+
converter_method = "to_#{data_model}".to_sym #
|
68
|
+
define_method method_name do |&block| # define_method :each_pcm do |&block|
|
69
|
+
if block # if block
|
70
|
+
each do |pm, infos| # each do |pm, infos|
|
71
|
+
block.call pm.send(converter_method) # block.call pm.send(:to_pcm)
|
72
|
+
end # end
|
73
|
+
else # else
|
74
|
+
Enumerator.new(self, method_name) # Enumerator.new(self, :each_pcm)
|
75
|
+
end # end
|
76
|
+
end # end
|
77
|
+
end
|
78
|
+
|
79
|
+
def ==(other)
|
80
|
+
(collection == other.collection) && (parameters == other.parameters)
|
81
|
+
rescue
|
82
|
+
false
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
2
86
|
end
|
@@ -6,20 +6,23 @@ module Bioinform
|
|
6
6
|
def count
|
7
7
|
matrix.first.inject(&:+)
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
|
+
def to_pcm
|
11
|
+
self
|
12
|
+
end
|
13
|
+
|
10
14
|
def to_pwm(pseudocount = Math.log(count))
|
11
15
|
mat = each_position.map do |pos|
|
12
16
|
pos.each_index.map do |ind|
|
13
17
|
Math.log((pos[ind] + probability[ind] * pseudocount) / (probability[ind]*(count + pseudocount)) )
|
14
18
|
end
|
15
19
|
end
|
16
|
-
PWM.new(matrix: mat, name: name)
|
20
|
+
PWM.new(matrix: mat, name: name, tags: tags, background: background)
|
17
21
|
end
|
18
|
-
|
22
|
+
|
19
23
|
def to_ppm
|
20
24
|
mat = each_position.map{|pos| pos.map{|el| el.to_f / count }}
|
21
|
-
PPM.new(matrix: mat, name: name)
|
25
|
+
PPM.new(matrix: mat, name: name, tags: tags, background: background)
|
22
26
|
end
|
23
|
-
|
24
27
|
end
|
25
28
|
end
|
@@ -4,45 +4,56 @@ require 'bioinform/parsers'
|
|
4
4
|
module Bioinform
|
5
5
|
IndexByLetter = {'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3, A: 0, C: 1, G: 2, T: 3}
|
6
6
|
LetterByIndex = {0 => :A, 1 => :C, 2 => :G, 3 => :T}
|
7
|
-
|
7
|
+
|
8
8
|
class PM
|
9
|
-
attr_reader :matrix
|
9
|
+
attr_reader :matrix, :tags
|
10
10
|
attr_accessor :background, :name
|
11
|
-
|
11
|
+
|
12
|
+
def mark(tag)
|
13
|
+
tags << tag
|
14
|
+
end
|
15
|
+
|
16
|
+
def tagged?(tag)
|
17
|
+
tags.any?{|t| (t.eql? tag) || (t.respond_to?(:name) && t.name && (t.name == tag)) }
|
18
|
+
end
|
19
|
+
|
12
20
|
def self.choose_parser(input)
|
13
21
|
[TrivialParser, Parser, StringParser, StringFantomParser].find do |parser|
|
14
22
|
self.new(input, parser) rescue nil
|
15
23
|
end
|
16
24
|
end
|
17
|
-
|
25
|
+
|
18
26
|
def initialize(input, parser = nil)
|
19
27
|
parser ||= self.class.choose_parser(input)
|
20
28
|
raise 'No one parser can process input' unless parser
|
21
29
|
result = parser.new(input).parse
|
22
30
|
@matrix = result[:matrix]
|
23
31
|
@name = result[:name]
|
24
|
-
@
|
32
|
+
@tags = result[:tags] || []
|
33
|
+
@background = result[:background] || [1, 1, 1, 1]
|
25
34
|
raise 'matrix not valid' unless valid?
|
26
35
|
end
|
27
|
-
|
36
|
+
|
28
37
|
def ==(other)
|
29
|
-
@matrix == other.matrix && @background == other.background
|
38
|
+
@matrix == other.matrix && @background == other.background && @name == other.name
|
39
|
+
rescue
|
40
|
+
false
|
30
41
|
end
|
31
|
-
|
42
|
+
|
32
43
|
def self.valid_matrix?(matrix)
|
33
44
|
matrix.is_a?(Array) &&
|
34
45
|
! matrix.empty? &&
|
35
46
|
matrix.all?{|pos| pos.is_a?(Array)} &&
|
36
47
|
matrix.all?{|pos| pos.size == 4} &&
|
37
48
|
matrix.all?{|pos| pos.all?{|el| el.is_a?(Numeric)}}
|
38
|
-
rescue
|
49
|
+
rescue
|
39
50
|
false
|
40
51
|
end
|
41
|
-
|
52
|
+
|
42
53
|
def valid?
|
43
54
|
self.class.valid_matrix?(@matrix)
|
44
55
|
end
|
45
|
-
|
56
|
+
|
46
57
|
def each_position
|
47
58
|
if block_given?
|
48
59
|
matrix.each{|pos| yield pos}
|
@@ -50,12 +61,12 @@ module Bioinform
|
|
50
61
|
Enumerator.new(self, :each_position)
|
51
62
|
end
|
52
63
|
end
|
53
|
-
|
64
|
+
|
54
65
|
def length
|
55
66
|
@matrix.length
|
56
67
|
end
|
57
68
|
alias_method :size, :length
|
58
|
-
|
69
|
+
|
59
70
|
def to_s(options = {})
|
60
71
|
default_options = {with_name: true, letters_as_rows: false}
|
61
72
|
options = default_options.merge(options)
|
@@ -65,41 +76,41 @@ module Bioinform
|
|
65
76
|
else
|
66
77
|
matrix_str = each_position.map{|pos| pos.join("\t")}.join("\n")
|
67
78
|
end
|
68
|
-
|
69
|
-
if options[:with_name] && @name
|
79
|
+
|
80
|
+
if options[:with_name] && @name
|
70
81
|
@name + "\n" + matrix_str
|
71
|
-
else
|
82
|
+
else
|
72
83
|
matrix_str
|
73
84
|
end
|
74
85
|
end
|
75
|
-
|
86
|
+
|
76
87
|
def pretty_string(options = {})
|
77
88
|
default_options = {with_name: true, letters_as_rows: false}
|
78
|
-
|
89
|
+
|
79
90
|
return to_s(options) if options[:letters_as_rows]
|
80
|
-
|
91
|
+
|
81
92
|
options = default_options.merge(options)
|
82
93
|
header = %w{A C G T}.map{|el| el.rjust(4).ljust(7)}.join + "\n"
|
83
94
|
matrix_rows = each_position.map do |position|
|
84
95
|
position.map{|el| el.round(3).to_s.rjust(6)}.join(' ')
|
85
96
|
end
|
86
|
-
|
97
|
+
|
87
98
|
matrix_str = matrix_rows.join("\n")
|
88
|
-
|
99
|
+
|
89
100
|
if options[:with_name] && @name
|
90
101
|
@name + "\n" + header + matrix_str
|
91
102
|
else
|
92
103
|
header + matrix_str
|
93
104
|
end
|
94
105
|
end
|
95
|
-
|
106
|
+
|
96
107
|
def to_hash
|
97
|
-
hsh = %w{A C G T}.each_with_index.collect_hash do |letter, letter_index|
|
108
|
+
hsh = %w{A C G T}.each_with_index.collect_hash do |letter, letter_index|
|
98
109
|
[ letter, @matrix.map{|pos| pos[letter_index]} ]
|
99
110
|
end
|
100
111
|
hsh.with_indifferent_access
|
101
112
|
end
|
102
|
-
|
113
|
+
|
103
114
|
# pm.background - returns a @background attribute
|
104
115
|
# pm.background(new_background) - sets an attribute and returns pm itself
|
105
116
|
# if more than one argument passed - raises an exception
|
@@ -110,17 +121,17 @@ module Bioinform
|
|
110
121
|
else raise ArgumentError, '#background method can get 0 or 1 argument'
|
111
122
|
end
|
112
123
|
end
|
113
|
-
|
124
|
+
|
114
125
|
def background!(new_background)
|
115
126
|
@background = new_background
|
116
127
|
self
|
117
128
|
end
|
118
|
-
|
129
|
+
|
119
130
|
def self.zero_column
|
120
131
|
[0, 0, 0, 0]
|
121
132
|
end
|
122
133
|
|
123
|
-
def reverse_complement!
|
134
|
+
def reverse_complement!
|
124
135
|
@matrix.reverse!.map!(&:reverse!)
|
125
136
|
self
|
126
137
|
end
|
@@ -132,10 +143,7 @@ module Bioinform
|
|
132
143
|
n.times{ @matrix.push(self.class.zero_column) }
|
133
144
|
self
|
134
145
|
end
|
135
|
-
|
136
|
-
@matrix.map!{|position| min = position.min; position.map{|element| element - min}}
|
137
|
-
self
|
138
|
-
end
|
146
|
+
|
139
147
|
def discrete!(rate)
|
140
148
|
@matrix.map!{|position| position.map{|element| (element * rate).ceil}}
|
141
149
|
self
|
@@ -144,13 +152,12 @@ module Bioinform
|
|
144
152
|
def vocabulary_volume
|
145
153
|
background.inject(&:+) ** length
|
146
154
|
end
|
147
|
-
|
155
|
+
|
148
156
|
def probability
|
149
157
|
sum = background.inject(0.0, &:+)
|
150
158
|
background.map{|element| element.to_f / sum}
|
151
159
|
end
|
152
160
|
|
153
|
-
|
154
161
|
#def split(first_chunk_length)
|
155
162
|
# [@matrix.first(first_chunk_length), matrix.last(length - first_chunk_length)]
|
156
163
|
#end
|
@@ -164,16 +171,16 @@ module Bioinform
|
|
164
171
|
def worst_score
|
165
172
|
@matrix.inject(0.0){|sum, col| sum + col.min}
|
166
173
|
end
|
167
|
-
|
174
|
+
|
168
175
|
# best score of suffix s[i..l]
|
169
176
|
def best_suffix(i)
|
170
177
|
@matrix[i...length].map(&:max).inject(0.0, &:+)
|
171
178
|
end
|
172
|
-
|
179
|
+
|
173
180
|
def worst_suffix(i)
|
174
181
|
@matrix[i...length].map(&:min).inject(0.0, &:+)
|
175
182
|
end
|
176
|
-
|
183
|
+
|
177
184
|
def reverse_complement
|
178
185
|
dup.reverse_complement!
|
179
186
|
end
|
@@ -183,14 +190,22 @@ module Bioinform
|
|
183
190
|
def right_augment(n)
|
184
191
|
dup.right_augment!(n)
|
185
192
|
end
|
186
|
-
def shift_to_zero
|
187
|
-
dup.shift_to_zero!
|
188
|
-
end
|
189
193
|
def discrete(rate)
|
190
194
|
dup.discrete!(rate)
|
191
195
|
end
|
192
196
|
def dup
|
193
197
|
deep_dup
|
194
198
|
end
|
199
|
+
|
200
|
+
def to_pcm
|
201
|
+
PCM.new(matrix: matrix, name: name, tags: tags, background: background)
|
202
|
+
end
|
203
|
+
def to_ppm
|
204
|
+
PPM.new(matrix: matrix, name: name, tags: tags, background: background)
|
205
|
+
end
|
206
|
+
def to_pwm
|
207
|
+
PWM.new(matrix: matrix, name: name, tags: tags, background: background)
|
208
|
+
end
|
209
|
+
|
195
210
|
end
|
196
211
|
end
|