bioinform 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/TODO.txt CHANGED
@@ -1,4 +1,8 @@
1
- Create Collection class. Rewrite MACRO-APE to use this class.
1
+ ToDo:
2
+ Decide how to define PM#eql? and PM#hash so that PMs can be used in Sets (and as hash keys) without breaking Set comparison: two Sets containing the same PMs (but different objects) should be equal.
3
+
4
+ Make parser exceptions print the text where parsing broke (the line being processed ± the 2 nearest lines, plus the command and the line numbers)
5
+ Prevent the parser from going into an infinite loop
2
6
 
3
7
  Create CLI-apps:
4
8
  -- to merge many files(or whole folder) to a Collection
@@ -10,9 +14,10 @@ Decide:
10
14
  PPM format such that parser got both matrix and count (if PPM have `word_count`)
11
15
  -- can_parse?
12
16
  -- Whether to cache suffixes: cache :best_suffix, obsolete: [:discrete!, :background!, ...]
17
+ -- behaviour of PM#== for PMs with different tags
18
+ -- should background be in PM by default?
19
+ -- refactor PM.new, PM#== and so on so that a variable can be consistently introduced or removed by changing a single line
20
+ -- Make PCM#valid? and PPM#valid? more specific. This shouldn't remove the ability to load arbitrary data as a matrix, but that should be possible only in force mode (I don't understand yet where it should live: in the constructor or somewhere else? And which validation "severity" levels should there be? Strong validation - size-only validation - size-and-type validation - no validation ??? Or maybe an option: valid_strictness: 'strict', 'usual', 'strict_with_name' ??? This needs to be considered)
13
21
 
14
22
  Specs
15
- -- PM#== (!)
16
- -- PWM#probabilities, #score_variance, #gauss_estimation
17
- -- pcm2pwm
18
- -- split_motifs
23
+ -- PWM#probabilities, #score_variance, #gauss_estimation
data/bioinform.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
8
8
  gem.summary = %q{Classes for work with different input formats of positional matrices and IUPAC-words and making simple transform and statistics with them. Also module includes several useful extensions for Enumerable module like parametric map and callable symbols }
9
9
  gem.homepage = ""
10
10
 
11
- gem.files = `git ls-files`.split($\)
11
+ gem.files = `git ls-files`.split($/)
12
12
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
13
13
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
14
14
  gem.name = "bioinform"
@@ -13,7 +13,7 @@ When filelist is empty, it's obtained from STDIN. One can use it: `ls -b pcm_fol
13
13
 
14
14
  Usage:
15
15
  #{__FILE__} [options] [<pcm-files>...]
16
-
16
+
17
17
  Options:
18
18
  -h --help Show this screen.
19
19
  -e --extension EXT Extension of output files [default: pwm]
@@ -21,27 +21,27 @@ Options:
21
21
  DOCOPT
22
22
 
23
23
  options = Docopt::docopt(doc, argv: argv)
24
-
24
+
25
25
  if options['<pcm-files>'].empty?
26
26
  filelist = $stdin.read.shellsplit
27
27
  else
28
28
  filelist = options['<pcm-files>']
29
29
  end
30
-
30
+
31
31
  folder = options['--folder']
32
32
  Dir.mkdir(folder) unless Dir.exist?(folder)
33
-
33
+
34
34
  filelist.each do |pcm_filename|
35
35
  pwm = Bioinform::PCM.new( File.read(pcm_filename) ).to_pwm
36
36
  File.open(Bioinform::CLI.output_filename(pcm_filename, options['--extension'], folder), 'w') do |f|
37
37
  f.puts pwm
38
38
  end
39
39
  end
40
-
40
+
41
41
  rescue Docopt::Exit => e
42
42
  puts e.message
43
43
  end
44
-
44
+
45
45
  end
46
46
  end
47
47
  end
@@ -4,7 +4,7 @@ require 'docopt'
4
4
  module Bioinform
5
5
  module CLI
6
6
  module SplitMotifs
7
-
7
+
8
8
  def self.main(argv)
9
9
  doc = <<-DOCOPT
10
10
  Motif splitter.
@@ -12,7 +12,7 @@ It get a file with a set of motifs and splits it into motifs according to their
12
12
 
13
13
  Usage:
14
14
  #{__FILE__} [options] <collection-file>
15
-
15
+
16
16
  Options:
17
17
  -h --help Show this screen.
18
18
  -m --data-model MODEL Data model: PM, PCM, PPM or PWM [default: PM]
@@ -21,17 +21,17 @@ Options:
21
21
  DOCOPT
22
22
 
23
23
  options = Docopt::docopt(doc, argv: argv)
24
-
24
+
25
25
  folder = options['--folder']
26
26
  Dir.mkdir(folder) unless Dir.exist?(folder)
27
-
27
+
28
28
  data_model = Bioinform.const_get(options['--data-model'].upcase)
29
29
  extension = options['--extension'] || options['--data-model'].downcase
30
-
30
+
31
31
  collection_filename = options['<collection-file>']
32
32
  raise "File #{collection_filename} not exist" unless File.exist? collection_filename
33
33
  input = File.read(collection_filename)
34
-
34
+
35
35
  data_model.choose_parser(input).split_on_motifs(input, data_model).each do |motif|
36
36
  File.open(File.join(folder, "#{motif.name}.#{extension}"), 'w') do |f|
37
37
  f.puts motif
@@ -40,7 +40,7 @@ Options:
40
40
  rescue Docopt::Exit => e
41
41
  puts e.message
42
42
  end
43
-
43
+
44
44
  end
45
45
  end
46
46
  end
@@ -5,5 +5,7 @@ require 'bioinform/data_models/pcm'
5
5
  require 'bioinform/data_models/ppm'
6
6
  require 'bioinform/data_models/pwm'
7
7
 
8
+ require 'bioinform/data_models/collection'
9
+
8
10
  #require 'bioinform/data_models/iupac_word'
9
11
  #require 'bioinform/data_models/iupac_wordset'
@@ -1,2 +1,86 @@
1
- class PMCollection
1
+ require 'ostruct'
2
+
3
+ module Bioinform
4
+ class Collection
5
+ attr_reader :collection, :parameters
6
+
7
+ # The collection name serves as a tag name for each motif in the collection. A motif can be included in several collections and so can have several tags
8
+ def initialize(parameters = {})
9
+ @collection = []
10
+ @parameters = OpenStruct.new(parameters)
11
+ yield @parameters if block_given?
12
+ end
13
+
14
+ def size
15
+ collection.size
16
+ end
17
+
18
+ def name
19
+ parameters.name
20
+ end
21
+
22
+ def to_s
23
+ "<Collection '#{name}'>"
24
+ end
25
+
26
+ def +(other)
27
+ result = self.class.new
28
+ each do |pm, infos|
29
+ result.add_pm(pm, infos)
30
+ end
31
+ other.each do |pm, infos|
32
+ result.add_pm(pm, infos)
33
+ end
34
+ result
35
+ end
36
+
37
+ def add_pm(pm, info)
38
+ pm.mark(self)
39
+ collection << [pm, info]
40
+ self
41
+ end
42
+
43
+ def <<(pm)
44
+ add_pm(pm, OpenStruct.new)
45
+ end
46
+
47
+ def each
48
+ if block_given?
49
+ collection.each{|pm, infos| yield [pm, infos]}
50
+ else
51
+ Enumerator.new(self, :each)
52
+ end
53
+ end
54
+
55
+ def each_pm
56
+ if block_given?
57
+ each{|pm, infos| yield pm}
58
+ else
59
+ Enumerator.new(self, :each_pm)
60
+ end
61
+ end
62
+
63
+ include Enumerable
64
+
65
+ %w[pcm ppm pwm].each do |data_model|
66
+ method_name = "each_#{data_model}".to_sym #
67
+ converter_method = "to_#{data_model}".to_sym #
68
+ define_method method_name do |&block| # define_method :each_pcm do |&block|
69
+ if block # if block
70
+ each do |pm, infos| # each do |pm, infos|
71
+ block.call pm.send(converter_method) # block.call pm.send(:to_pcm)
72
+ end # end
73
+ else # else
74
+ Enumerator.new(self, method_name) # Enumerator.new(self, :each_pcm)
75
+ end # end
76
+ end # end
77
+ end
78
+
79
+ def ==(other)
80
+ (collection == other.collection) && (parameters == other.parameters)
81
+ rescue
82
+ false
83
+ end
84
+
85
+ end
2
86
  end
@@ -6,20 +6,23 @@ module Bioinform
6
6
  def count
7
7
  matrix.first.inject(&:+)
8
8
  end
9
-
9
+
10
+ def to_pcm
11
+ self
12
+ end
13
+
10
14
  def to_pwm(pseudocount = Math.log(count))
11
15
  mat = each_position.map do |pos|
12
16
  pos.each_index.map do |ind|
13
17
  Math.log((pos[ind] + probability[ind] * pseudocount) / (probability[ind]*(count + pseudocount)) )
14
18
  end
15
19
  end
16
- PWM.new(matrix: mat, name: name)
20
+ PWM.new(matrix: mat, name: name, tags: tags, background: background)
17
21
  end
18
-
22
+
19
23
  def to_ppm
20
24
  mat = each_position.map{|pos| pos.map{|el| el.to_f / count }}
21
- PPM.new(matrix: mat, name: name)
25
+ PPM.new(matrix: mat, name: name, tags: tags, background: background)
22
26
  end
23
-
24
27
  end
25
28
  end
@@ -4,45 +4,56 @@ require 'bioinform/parsers'
4
4
  module Bioinform
5
5
  IndexByLetter = {'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3, A: 0, C: 1, G: 2, T: 3}
6
6
  LetterByIndex = {0 => :A, 1 => :C, 2 => :G, 3 => :T}
7
-
7
+
8
8
  class PM
9
- attr_reader :matrix
9
+ attr_reader :matrix, :tags
10
10
  attr_accessor :background, :name
11
-
11
+
12
+ def mark(tag)
13
+ tags << tag
14
+ end
15
+
16
+ def tagged?(tag)
17
+ tags.any?{|t| (t.eql? tag) || (t.respond_to?(:name) && t.name && (t.name == tag)) }
18
+ end
19
+
12
20
  def self.choose_parser(input)
13
21
  [TrivialParser, Parser, StringParser, StringFantomParser].find do |parser|
14
22
  self.new(input, parser) rescue nil
15
23
  end
16
24
  end
17
-
25
+
18
26
  def initialize(input, parser = nil)
19
27
  parser ||= self.class.choose_parser(input)
20
28
  raise 'No one parser can process input' unless parser
21
29
  result = parser.new(input).parse
22
30
  @matrix = result[:matrix]
23
31
  @name = result[:name]
24
- @background = [1, 1, 1, 1]
32
+ @tags = result[:tags] || []
33
+ @background = result[:background] || [1, 1, 1, 1]
25
34
  raise 'matrix not valid' unless valid?
26
35
  end
27
-
36
+
28
37
  def ==(other)
29
- @matrix == other.matrix && @background == other.background
38
+ @matrix == other.matrix && @background == other.background && @name == other.name
39
+ rescue
40
+ false
30
41
  end
31
-
42
+
32
43
  def self.valid_matrix?(matrix)
33
44
  matrix.is_a?(Array) &&
34
45
  ! matrix.empty? &&
35
46
  matrix.all?{|pos| pos.is_a?(Array)} &&
36
47
  matrix.all?{|pos| pos.size == 4} &&
37
48
  matrix.all?{|pos| pos.all?{|el| el.is_a?(Numeric)}}
38
- rescue
49
+ rescue
39
50
  false
40
51
  end
41
-
52
+
42
53
  def valid?
43
54
  self.class.valid_matrix?(@matrix)
44
55
  end
45
-
56
+
46
57
  def each_position
47
58
  if block_given?
48
59
  matrix.each{|pos| yield pos}
@@ -50,12 +61,12 @@ module Bioinform
50
61
  Enumerator.new(self, :each_position)
51
62
  end
52
63
  end
53
-
64
+
54
65
  def length
55
66
  @matrix.length
56
67
  end
57
68
  alias_method :size, :length
58
-
69
+
59
70
  def to_s(options = {})
60
71
  default_options = {with_name: true, letters_as_rows: false}
61
72
  options = default_options.merge(options)
@@ -65,41 +76,41 @@ module Bioinform
65
76
  else
66
77
  matrix_str = each_position.map{|pos| pos.join("\t")}.join("\n")
67
78
  end
68
-
69
- if options[:with_name] && @name
79
+
80
+ if options[:with_name] && @name
70
81
  @name + "\n" + matrix_str
71
- else
82
+ else
72
83
  matrix_str
73
84
  end
74
85
  end
75
-
86
+
76
87
  def pretty_string(options = {})
77
88
  default_options = {with_name: true, letters_as_rows: false}
78
-
89
+
79
90
  return to_s(options) if options[:letters_as_rows]
80
-
91
+
81
92
  options = default_options.merge(options)
82
93
  header = %w{A C G T}.map{|el| el.rjust(4).ljust(7)}.join + "\n"
83
94
  matrix_rows = each_position.map do |position|
84
95
  position.map{|el| el.round(3).to_s.rjust(6)}.join(' ')
85
96
  end
86
-
97
+
87
98
  matrix_str = matrix_rows.join("\n")
88
-
99
+
89
100
  if options[:with_name] && @name
90
101
  @name + "\n" + header + matrix_str
91
102
  else
92
103
  header + matrix_str
93
104
  end
94
105
  end
95
-
106
+
96
107
  def to_hash
97
- hsh = %w{A C G T}.each_with_index.collect_hash do |letter, letter_index|
108
+ hsh = %w{A C G T}.each_with_index.collect_hash do |letter, letter_index|
98
109
  [ letter, @matrix.map{|pos| pos[letter_index]} ]
99
110
  end
100
111
  hsh.with_indifferent_access
101
112
  end
102
-
113
+
103
114
  # pm.background - returns a @background attribute
104
115
  # pm.background(new_background) - sets an attribute and returns pm itself
105
116
  # if more than one argument passed - raises an exception
@@ -110,17 +121,17 @@ module Bioinform
110
121
  else raise ArgumentError, '#background method can get 0 or 1 argument'
111
122
  end
112
123
  end
113
-
124
+
114
125
  def background!(new_background)
115
126
  @background = new_background
116
127
  self
117
128
  end
118
-
129
+
119
130
  def self.zero_column
120
131
  [0, 0, 0, 0]
121
132
  end
122
133
 
123
- def reverse_complement!
134
+ def reverse_complement!
124
135
  @matrix.reverse!.map!(&:reverse!)
125
136
  self
126
137
  end
@@ -132,10 +143,7 @@ module Bioinform
132
143
  n.times{ @matrix.push(self.class.zero_column) }
133
144
  self
134
145
  end
135
- def shift_to_zero! # make worst score == 0 by shifting scores of each column
136
- @matrix.map!{|position| min = position.min; position.map{|element| element - min}}
137
- self
138
- end
146
+
139
147
  def discrete!(rate)
140
148
  @matrix.map!{|position| position.map{|element| (element * rate).ceil}}
141
149
  self
@@ -144,13 +152,12 @@ module Bioinform
144
152
  def vocabulary_volume
145
153
  background.inject(&:+) ** length
146
154
  end
147
-
155
+
148
156
  def probability
149
157
  sum = background.inject(0.0, &:+)
150
158
  background.map{|element| element.to_f / sum}
151
159
  end
152
160
 
153
-
154
161
  #def split(first_chunk_length)
155
162
  # [@matrix.first(first_chunk_length), matrix.last(length - first_chunk_length)]
156
163
  #end
@@ -164,16 +171,16 @@ module Bioinform
164
171
  def worst_score
165
172
  @matrix.inject(0.0){|sum, col| sum + col.min}
166
173
  end
167
-
174
+
168
175
  # best score of suffix s[i..l]
169
176
  def best_suffix(i)
170
177
  @matrix[i...length].map(&:max).inject(0.0, &:+)
171
178
  end
172
-
179
+
173
180
  def worst_suffix(i)
174
181
  @matrix[i...length].map(&:min).inject(0.0, &:+)
175
182
  end
176
-
183
+
177
184
  def reverse_complement
178
185
  dup.reverse_complement!
179
186
  end
@@ -183,14 +190,22 @@ module Bioinform
183
190
  def right_augment(n)
184
191
  dup.right_augment!(n)
185
192
  end
186
- def shift_to_zero
187
- dup.shift_to_zero!
188
- end
189
193
  def discrete(rate)
190
194
  dup.discrete!(rate)
191
195
  end
192
196
  def dup
193
197
  deep_dup
194
198
  end
199
+
200
+ def to_pcm
201
+ PCM.new(matrix: matrix, name: name, tags: tags, background: background)
202
+ end
203
+ def to_ppm
204
+ PPM.new(matrix: matrix, name: name, tags: tags, background: background)
205
+ end
206
+ def to_pwm
207
+ PWM.new(matrix: matrix, name: name, tags: tags, background: background)
208
+ end
209
+
195
210
  end
196
211
  end