bioinform 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
data/TODO.txt CHANGED
@@ -1,4 +1,8 @@
1
- Create Collection class. Rewrite MACRO-APE to use this class.
1
+ ToDo:
2
+ How to define PM#eql? and PM#hash so that PMs can be used in Sets without breaking Set comparison: two sets containing equal PMs (but different objects) should be equal. (The same applies to using a PM as a hash key.)
3
+
4
+ Make parser exceptions print the text where parsing failed (the line being processed ± the 2 nearest lines, plus the command and line numbers)
5
+ Prevent the parser from going into an infinite loop
2
6
 
3
7
  Create CLI-apps:
4
8
  -- to merge many files(or whole folder) to a Collection
@@ -10,9 +14,10 @@ Decide:
10
14
  PPM format such that the parser gets both the matrix and the count (if the PPM has `word_count`)
11
15
  -- can_parse?
12
16
  -- Whether to cache suffixes: cache :best_suffix, obsolete: [:discrete!, :background!, ...]
17
+ -- behaviour of PM#== for PMs with different tags
18
+ -- should background be in PM by default?
19
+ -- refactor PM.new, PM#== and so on so that a variable can be consistently introduced or removed by changing a single line
20
+ -- Make PCM#valid? and PPM#valid? more specific. This shouldn't remove the ability to load arbitrary data as a matrix, but that should only be allowed in force mode. (I don't understand yet where this should live: in the constructor or elsewhere? And which validation "severity" levels should exist? Strong validation / size-only validation / size-and-type validation / no validation? Or maybe an option: valid_strictness: 'strict', 'usual', 'strict_with_name'? This needs to be considered.)
13
21
 
14
22
  Specs
15
- -- PM#== (!)
16
- -- PWM#probabilities, #score_variance, #gauss_estimation
17
- -- pcm2pwm
18
- -- split_motifs
23
+ -- PWM#probabilities, #score_variance, #gauss_estimation
data/bioinform.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
8
8
  gem.summary = %q{Classes for work with different input formats of positional matrices and IUPAC-words and making simple transform and statistics with them. Also module includes several useful extensions for Enumerable module like parametric map and callable symbols }
9
9
  gem.homepage = ""
10
10
 
11
- gem.files = `git ls-files`.split($\)
11
+ gem.files = `git ls-files`.split($/)
12
12
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
13
13
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
14
14
  gem.name = "bioinform"
@@ -13,7 +13,7 @@ When filelist is empty, it's obtained from STDIN. One can use it: `ls -b pcm_fol
13
13
 
14
14
  Usage:
15
15
  #{__FILE__} [options] [<pcm-files>...]
16
-
16
+
17
17
  Options:
18
18
  -h --help Show this screen.
19
19
  -e --extension EXT Extension of output files [default: pwm]
@@ -21,27 +21,27 @@ Options:
21
21
  DOCOPT
22
22
 
23
23
  options = Docopt::docopt(doc, argv: argv)
24
-
24
+
25
25
  if options['<pcm-files>'].empty?
26
26
  filelist = $stdin.read.shellsplit
27
27
  else
28
28
  filelist = options['<pcm-files>']
29
29
  end
30
-
30
+
31
31
  folder = options['--folder']
32
32
  Dir.mkdir(folder) unless Dir.exist?(folder)
33
-
33
+
34
34
  filelist.each do |pcm_filename|
35
35
  pwm = Bioinform::PCM.new( File.read(pcm_filename) ).to_pwm
36
36
  File.open(Bioinform::CLI.output_filename(pcm_filename, options['--extension'], folder), 'w') do |f|
37
37
  f.puts pwm
38
38
  end
39
39
  end
40
-
40
+
41
41
  rescue Docopt::Exit => e
42
42
  puts e.message
43
43
  end
44
-
44
+
45
45
  end
46
46
  end
47
47
  end
@@ -4,7 +4,7 @@ require 'docopt'
4
4
  module Bioinform
5
5
  module CLI
6
6
  module SplitMotifs
7
-
7
+
8
8
  def self.main(argv)
9
9
  doc = <<-DOCOPT
10
10
  Motif splitter.
@@ -12,7 +12,7 @@ It get a file with a set of motifs and splits it into motifs according to their
12
12
 
13
13
  Usage:
14
14
  #{__FILE__} [options] <collection-file>
15
-
15
+
16
16
  Options:
17
17
  -h --help Show this screen.
18
18
  -m --data-model MODEL Data model: PM, PCM, PPM or PWM [default: PM]
@@ -21,17 +21,17 @@ Options:
21
21
  DOCOPT
22
22
 
23
23
  options = Docopt::docopt(doc, argv: argv)
24
-
24
+
25
25
  folder = options['--folder']
26
26
  Dir.mkdir(folder) unless Dir.exist?(folder)
27
-
27
+
28
28
  data_model = Bioinform.const_get(options['--data-model'].upcase)
29
29
  extension = options['--extension'] || options['--data-model'].downcase
30
-
30
+
31
31
  collection_filename = options['<collection-file>']
32
32
  raise "File #{collection_filename} not exist" unless File.exist? collection_filename
33
33
  input = File.read(collection_filename)
34
-
34
+
35
35
  data_model.choose_parser(input).split_on_motifs(input, data_model).each do |motif|
36
36
  File.open(File.join(folder, "#{motif.name}.#{extension}"), 'w') do |f|
37
37
  f.puts motif
@@ -40,7 +40,7 @@ Options:
40
40
  rescue Docopt::Exit => e
41
41
  puts e.message
42
42
  end
43
-
43
+
44
44
  end
45
45
  end
46
46
  end
@@ -5,5 +5,7 @@ require 'bioinform/data_models/pcm'
5
5
  require 'bioinform/data_models/ppm'
6
6
  require 'bioinform/data_models/pwm'
7
7
 
8
+ require 'bioinform/data_models/collection'
9
+
8
10
  #require 'bioinform/data_models/iupac_word'
9
11
  #require 'bioinform/data_models/iupac_wordset'
@@ -1,2 +1,86 @@
1
- class PMCollection
1
+ require 'ostruct'
2
+
3
+ module Bioinform
4
+ class Collection
5
+ attr_reader :collection, :parameters
6
+
7
+ # The collection name is a tag name for each motif in the collection. A motif can be included in several collections and so can have several tags.
8
+ def initialize(parameters = {})
9
+ @collection = []
10
+ @parameters = OpenStruct.new(parameters)
11
+ yield @parameters if block_given?
12
+ end
13
+
14
+ def size
15
+ collection.size
16
+ end
17
+
18
+ def name
19
+ parameters.name
20
+ end
21
+
22
+ def to_s
23
+ "<Collection '#{name}'>"
24
+ end
25
+
26
+ def +(other)
27
+ result = self.class.new
28
+ each do |pm, infos|
29
+ result.add_pm(pm, infos)
30
+ end
31
+ other.each do |pm, infos|
32
+ result.add_pm(pm, infos)
33
+ end
34
+ result
35
+ end
36
+
37
+ def add_pm(pm, info)
38
+ pm.mark(self)
39
+ collection << [pm, info]
40
+ self
41
+ end
42
+
43
+ def <<(pm)
44
+ add_pm(pm, OpenStruct.new)
45
+ end
46
+
47
+ def each
48
+ if block_given?
49
+ collection.each{|pm, infos| yield [pm, infos]}
50
+ else
51
+ Enumerator.new(self, :each)
52
+ end
53
+ end
54
+
55
+ def each_pm
56
+ if block_given?
57
+ each{|pm, infos| yield pm}
58
+ else
59
+ Enumerator.new(self, :each_pm)
60
+ end
61
+ end
62
+
63
+ include Enumerable
64
+
65
+ %w[pcm ppm pwm].each do |data_model|
66
+ method_name = "each_#{data_model}".to_sym #
67
+ converter_method = "to_#{data_model}".to_sym #
68
+ define_method method_name do |&block| # define_method :each_pcm do |&block|
69
+ if block # if block
70
+ each do |pm, infos| # each do |pm, infos|
71
+ block.call pm.send(converter_method) # block.call pm.send(:to_pcm)
72
+ end # end
73
+ else # else
74
+ Enumerator.new(self, method_name) # Enumerator.new(self, :each_pcm)
75
+ end # end
76
+ end # end
77
+ end
78
+
79
+ def ==(other)
80
+ (collection == other.collection) && (parameters == other.parameters)
81
+ rescue
82
+ false
83
+ end
84
+
85
+ end
2
86
  end
@@ -6,20 +6,23 @@ module Bioinform
6
6
  def count
7
7
  matrix.first.inject(&:+)
8
8
  end
9
-
9
+
10
+ def to_pcm
11
+ self
12
+ end
13
+
10
14
  def to_pwm(pseudocount = Math.log(count))
11
15
  mat = each_position.map do |pos|
12
16
  pos.each_index.map do |ind|
13
17
  Math.log((pos[ind] + probability[ind] * pseudocount) / (probability[ind]*(count + pseudocount)) )
14
18
  end
15
19
  end
16
- PWM.new(matrix: mat, name: name)
20
+ PWM.new(matrix: mat, name: name, tags: tags, background: background)
17
21
  end
18
-
22
+
19
23
  def to_ppm
20
24
  mat = each_position.map{|pos| pos.map{|el| el.to_f / count }}
21
- PPM.new(matrix: mat, name: name)
25
+ PPM.new(matrix: mat, name: name, tags: tags, background: background)
22
26
  end
23
-
24
27
  end
25
28
  end
@@ -4,45 +4,56 @@ require 'bioinform/parsers'
4
4
  module Bioinform
5
5
  IndexByLetter = {'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3, A: 0, C: 1, G: 2, T: 3}
6
6
  LetterByIndex = {0 => :A, 1 => :C, 2 => :G, 3 => :T}
7
-
7
+
8
8
  class PM
9
- attr_reader :matrix
9
+ attr_reader :matrix, :tags
10
10
  attr_accessor :background, :name
11
-
11
+
12
+ def mark(tag)
13
+ tags << tag
14
+ end
15
+
16
+ def tagged?(tag)
17
+ tags.any?{|t| (t.eql? tag) || (t.respond_to?(:name) && t.name && (t.name == tag)) }
18
+ end
19
+
12
20
  def self.choose_parser(input)
13
21
  [TrivialParser, Parser, StringParser, StringFantomParser].find do |parser|
14
22
  self.new(input, parser) rescue nil
15
23
  end
16
24
  end
17
-
25
+
18
26
  def initialize(input, parser = nil)
19
27
  parser ||= self.class.choose_parser(input)
20
28
  raise 'No one parser can process input' unless parser
21
29
  result = parser.new(input).parse
22
30
  @matrix = result[:matrix]
23
31
  @name = result[:name]
24
- @background = [1, 1, 1, 1]
32
+ @tags = result[:tags] || []
33
+ @background = result[:background] || [1, 1, 1, 1]
25
34
  raise 'matrix not valid' unless valid?
26
35
  end
27
-
36
+
28
37
  def ==(other)
29
- @matrix == other.matrix && @background == other.background
38
+ @matrix == other.matrix && @background == other.background && @name == other.name
39
+ rescue
40
+ false
30
41
  end
31
-
42
+
32
43
  def self.valid_matrix?(matrix)
33
44
  matrix.is_a?(Array) &&
34
45
  ! matrix.empty? &&
35
46
  matrix.all?{|pos| pos.is_a?(Array)} &&
36
47
  matrix.all?{|pos| pos.size == 4} &&
37
48
  matrix.all?{|pos| pos.all?{|el| el.is_a?(Numeric)}}
38
- rescue
49
+ rescue
39
50
  false
40
51
  end
41
-
52
+
42
53
  def valid?
43
54
  self.class.valid_matrix?(@matrix)
44
55
  end
45
-
56
+
46
57
  def each_position
47
58
  if block_given?
48
59
  matrix.each{|pos| yield pos}
@@ -50,12 +61,12 @@ module Bioinform
50
61
  Enumerator.new(self, :each_position)
51
62
  end
52
63
  end
53
-
64
+
54
65
  def length
55
66
  @matrix.length
56
67
  end
57
68
  alias_method :size, :length
58
-
69
+
59
70
  def to_s(options = {})
60
71
  default_options = {with_name: true, letters_as_rows: false}
61
72
  options = default_options.merge(options)
@@ -65,41 +76,41 @@ module Bioinform
65
76
  else
66
77
  matrix_str = each_position.map{|pos| pos.join("\t")}.join("\n")
67
78
  end
68
-
69
- if options[:with_name] && @name
79
+
80
+ if options[:with_name] && @name
70
81
  @name + "\n" + matrix_str
71
- else
82
+ else
72
83
  matrix_str
73
84
  end
74
85
  end
75
-
86
+
76
87
  def pretty_string(options = {})
77
88
  default_options = {with_name: true, letters_as_rows: false}
78
-
89
+
79
90
  return to_s(options) if options[:letters_as_rows]
80
-
91
+
81
92
  options = default_options.merge(options)
82
93
  header = %w{A C G T}.map{|el| el.rjust(4).ljust(7)}.join + "\n"
83
94
  matrix_rows = each_position.map do |position|
84
95
  position.map{|el| el.round(3).to_s.rjust(6)}.join(' ')
85
96
  end
86
-
97
+
87
98
  matrix_str = matrix_rows.join("\n")
88
-
99
+
89
100
  if options[:with_name] && @name
90
101
  @name + "\n" + header + matrix_str
91
102
  else
92
103
  header + matrix_str
93
104
  end
94
105
  end
95
-
106
+
96
107
  def to_hash
97
- hsh = %w{A C G T}.each_with_index.collect_hash do |letter, letter_index|
108
+ hsh = %w{A C G T}.each_with_index.collect_hash do |letter, letter_index|
98
109
  [ letter, @matrix.map{|pos| pos[letter_index]} ]
99
110
  end
100
111
  hsh.with_indifferent_access
101
112
  end
102
-
113
+
103
114
  # pm.background - returns a @background attribute
104
115
  # pm.background(new_background) - sets an attribute and returns pm itself
105
116
  # if more than one argument passed - raises an exception
@@ -110,17 +121,17 @@ module Bioinform
110
121
  else raise ArgumentError, '#background method can get 0 or 1 argument'
111
122
  end
112
123
  end
113
-
124
+
114
125
  def background!(new_background)
115
126
  @background = new_background
116
127
  self
117
128
  end
118
-
129
+
119
130
  def self.zero_column
120
131
  [0, 0, 0, 0]
121
132
  end
122
133
 
123
- def reverse_complement!
134
+ def reverse_complement!
124
135
  @matrix.reverse!.map!(&:reverse!)
125
136
  self
126
137
  end
@@ -132,10 +143,7 @@ module Bioinform
132
143
  n.times{ @matrix.push(self.class.zero_column) }
133
144
  self
134
145
  end
135
- def shift_to_zero! # make worst score == 0 by shifting scores of each column
136
- @matrix.map!{|position| min = position.min; position.map{|element| element - min}}
137
- self
138
- end
146
+
139
147
  def discrete!(rate)
140
148
  @matrix.map!{|position| position.map{|element| (element * rate).ceil}}
141
149
  self
@@ -144,13 +152,12 @@ module Bioinform
144
152
  def vocabulary_volume
145
153
  background.inject(&:+) ** length
146
154
  end
147
-
155
+
148
156
  def probability
149
157
  sum = background.inject(0.0, &:+)
150
158
  background.map{|element| element.to_f / sum}
151
159
  end
152
160
 
153
-
154
161
  #def split(first_chunk_length)
155
162
  # [@matrix.first(first_chunk_length), matrix.last(length - first_chunk_length)]
156
163
  #end
@@ -164,16 +171,16 @@ module Bioinform
164
171
  def worst_score
165
172
  @matrix.inject(0.0){|sum, col| sum + col.min}
166
173
  end
167
-
174
+
168
175
  # best score of suffix s[i..l]
169
176
  def best_suffix(i)
170
177
  @matrix[i...length].map(&:max).inject(0.0, &:+)
171
178
  end
172
-
179
+
173
180
  def worst_suffix(i)
174
181
  @matrix[i...length].map(&:min).inject(0.0, &:+)
175
182
  end
176
-
183
+
177
184
  def reverse_complement
178
185
  dup.reverse_complement!
179
186
  end
@@ -183,14 +190,22 @@ module Bioinform
183
190
  def right_augment(n)
184
191
  dup.right_augment!(n)
185
192
  end
186
- def shift_to_zero
187
- dup.shift_to_zero!
188
- end
189
193
  def discrete(rate)
190
194
  dup.discrete!(rate)
191
195
  end
192
196
  def dup
193
197
  deep_dup
194
198
  end
199
+
200
+ def to_pcm
201
+ PCM.new(matrix: matrix, name: name, tags: tags, background: background)
202
+ end
203
+ def to_ppm
204
+ PPM.new(matrix: matrix, name: name, tags: tags, background: background)
205
+ end
206
+ def to_pwm
207
+ PWM.new(matrix: matrix, name: name, tags: tags, background: background)
208
+ end
209
+
195
210
  end
196
211
  end