macroape 3.3.2 → 3.3.3

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -15,4 +15,3 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
- TODO.txt
data/Rakefile.rb ADDED
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require 'rspec/core/rake_task'
4
+
5
+ namespace :spec do
6
+ task :find_threshold do
7
+ system("ruby -I ./test test/find_threshold_test.rb")
8
+ end
9
+ task :find_pvalue do
10
+ system("ruby -I ./test test/find_pvalue_test.rb")
11
+ end
12
+ task :eval_similarity do
13
+ system("ruby -I ./test test/eval_similarity_test.rb")
14
+ end
15
+ task :eval_alignment_similarity do
16
+ system("ruby -I ./test test/eval_alignment_similarity_test.rb")
17
+ end
18
+ task :preprocess_collection do
19
+ system("ruby -I ./test test/preprocess_collection_test.rb")
20
+ end
21
+ task :scan_collection do
22
+ system("ruby -I ./test test/scan_collection_test.rb")
23
+ end
24
+ task :tests => [:find_threshold, :find_pvalue, :eval_similarity,
25
+ :eval_alignment_similarity, :scan_collection, :preprocess_collection]
26
+
27
+ RSpec::Core::RakeTask.new
28
+ end
29
+
30
+ desc 'Test all functionality of gem executables'
31
+ task :spec => ['spec:tests', 'spec:spec']
32
+
33
+ namespace :benchmark do
34
+ task :run do
35
+ require 'open3'
36
+ time = Time.now.strftime("%d-%m-%Y, %H:%M:%S sec")
37
+ File.open('benchmark/benchmark.log','a') do |f|
38
+ f.puts "=========================================================\n#{time}\n"
39
+ Dir.glob('benchmark/*_benchmark.rb') do |benchmark_filename|
40
+ Open3.popen3("ruby -I ./benchmark #{benchmark_filename}") do |inp, out, err, wait_thr|
41
+ benchmark_name = File.basename(benchmark_filename)
42
+ out_str = out.read
43
+ err_str = err.read
44
+
45
+ benchmark_infos = "-------------------\n#{benchmark_name}:\n#{out_str}\n"
46
+ benchmark_infos_to_file = benchmark_infos
47
+ puts benchmark_infos
48
+
49
+ if err_str && !err_str.empty?
50
+ STDERR.puts(err_str)
51
+ benchmark_infos_to_file = benchmark_infos + "\n!!!\nError:\n#{err_str}\n"
52
+ end
53
+
54
+ # add info about git commit (if everything is committed, otherwise to commit one should use special option -c)
55
+ f.puts benchmark_infos_to_file
56
+ end
57
+ end
58
+ end
59
+ end
60
+ task :show do
61
+ puts File.read('benchmark/benchmark.log')
62
+ end
63
+ end
64
+
65
+ task :benchmark => 'benchmark:run'
data/TODO.txt ADDED
@@ -0,0 +1,20 @@
1
+ Absolutely necessary:
2
+ Repair obtaining matrix not only from files but also from stdin
3
+ Make it available to load collections in preprocess_collection from a single file (and from stdin, of course)
4
+ Make it available to load PCM files (they should first be preprocessed to PWMs in a standardized way) -- maybe it's better to use a pipeline
5
+
6
+ Specs and tests:
7
+ create spec on use of MaxHashSize, MaxHashSizeDouble
8
+ create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
9
+ create test for getting PWMs from stdin
10
+ create test for nonuniform word-wise background([1,1,1,1]) and for different backgrounds
11
+
12
+ Ideas to increase performance:
13
+ - Add shifting matrix elements to zero after discretization - in such a case the worst suffix is zero at all positions
14
+ - (?) Make rearrangement of rows by DIC decreasing in aligned pair of matrices before counting
15
+ - Create JAVA extension for alignment_intersection methods in order to increase performance
16
+ - Possibly the algorithm shouldn't use a hash but instead have two iterations: first it determines the possible hash scores for every length of each pwm separately (if the worst suffix is always zero, it's a flat space of scores at all pwm prefix lengths). After that we can work with arrays which use such scores as indices via an additional substructure
17
+
18
+ Usability issues:
19
+ review Collection class. Now it's completely useless. Maybe it should even be in another gem (with blackjack and clustering)
20
+
@@ -0,0 +1,56 @@
1
+ require 'benchmark'
2
+
3
+ $:.unshift File.join(File.dirname(__FILE__),'../lib')
4
+ require 'macroape'
5
+
6
+ class TaskToBenchmark
7
+ def setup
8
+ @matrix_first = "KLF4_f2.xml
9
+ 0.30861857265872605 -2.254321000121579 0.13505703522674192 0.3285194224375633
10
+ -1.227018967707036 -4.814127713368663 1.3059890687390967 -4.908681463544344
11
+ -2.443469374521196 -4.648238485031404 1.3588686548279805 -4.441801801188402
12
+ -2.7177827948276123 -3.8073538975356565 1.356272809724262 -3.504104725510225
13
+ -0.5563232977367343 0.5340697765121405 -3.61417723090579 0.5270259776377405
14
+ -1.8687622060887386 -4.381483976582316 1.337932245336098 -3.815629658877517
15
+ -2.045671123823928 -2.384975142213679 0.7198551207724355 0.5449254135616948
16
+ -1.373157530374372 -3.0063112097748217 1.285188335493552 -2.5026044231773543
17
+ -2.1030513122772208 -1.8941348100402244 1.249265758393991 -1.4284210948906104
18
+ -1.3277128628152939 0.8982415633049462 -0.8080773665408135 -0.18161647647456935
19
+ "
20
+
21
+ @matrix_second = "> SP1_f1
22
+ -0.24435707885585334 -0.6748234046937317 0.8657012535789861 -1.1060188862599292
23
+ -1.0631255752097801 -2.1119259694238686 1.0960627561110399 -0.6138563775211981
24
+ -0.387227623476054 -2.973985191321805 1.1807800242010371 -4.338927525031567
25
+ -4.563896055436894 -2.916163300253228 1.3684371349982631 -5.077972423609655
26
+ -2.2369752892820087 -3.719643631330185 1.3510439136452728 -4.8899306705082335
27
+ -0.07473964149330914 0.9449196547620103 -2.624685764808605 -0.851098348782244
28
+ -1.9643526491643326 -2.9784027708801153 1.3113096718240569 -2.3243342594990253
29
+ -4.015548413965584 -3.138426807809667 1.338748858978805 -2.0846739035376483
30
+ -0.4450938582835542 -2.2510053061629707 1.126543157436868 -1.7780413702431377
31
+ -1.1896356092245055 -1.2251832285630033 1.163676006374752 -1.6080243648157357
32
+ -0.5166047365590577 0.7641033353626651 -0.28626775700282125 -0.6825482097865606"
33
+
34
+ @pvalue = 0.0005
35
+ @discretization = 10
36
+ @first_background, @second_background = [1,1,1,1], [1,1,1,1]
37
+
38
+ @pwm_first = Bioinform::PWM.new(@matrix_first).background(@first_background).discrete(@discretization)
39
+ @pwm_second = Bioinform::PWM.new(@matrix_second).background(@second_background).discrete(@discretization)
40
+ @cmp = Macroape::PWMCompare.new(@pwm_first, @pwm_second)
41
+ self
42
+ end
43
+
44
+ def run
45
+ first_threshold = @pwm_first.threshold(@pvalue)
46
+ second_threshold = @pwm_second.threshold(@pvalue)
47
+ info = @cmp.jaccard(first_threshold, second_threshold)
48
+ end
49
+ end
50
+
51
+ benchmark_result = 10.times.collect do
52
+ task_to_benchmark = TaskToBenchmark.new.setup
53
+ Benchmark.measure{ task_to_benchmark.run }
54
+ end.inject(&:+)
55
+
56
+ puts benchmark_result
data/lib/macroape.rb CHANGED
@@ -1,8 +1,7 @@
1
1
  require 'macroape/version'
2
2
 
3
3
  require 'bioinform'
4
- require 'macroape/threshold_by_pvalue'
5
- require 'macroape/count_by_threshold'
4
+ require 'macroape/counting'
6
5
 
7
6
  require 'macroape/aligned_pair_intersection'
8
7
  require 'macroape/pwm_compare_aligned'
@@ -1,136 +1,63 @@
1
1
  module Macroape
2
2
  class PWMCompareAligned
3
-
3
+
4
+ # unoptimized version of this and related methods
4
5
  def counts_for_two_matrices(threshold_first, threshold_second)
5
- if first.background == second.background
6
- if first.background == [1,1,1,1]
7
- common_words_for_two_matrices(threshold_first, threshold_second)
8
- else
9
- counts_for_two_matrices_with_same_probabilities(threshold_first, threshold_second)
10
- end
6
+ # just not to call method each time
7
+ first_background = first.background
8
+ second_background = second.background
9
+ unless first_background == second_background
10
+ first_result = get_counts(threshold_first, threshold_second) {|score,letter| first_background[letter] * score }
11
+ second_result = get_counts(threshold_first, threshold_second) {|score,letter| second_background[letter] * score }
12
+ return [first_result, second_result]
13
+ end
14
+ if first.background == [1,1,1,1]
15
+ result = get_counts(threshold_first, threshold_second) {|score,letter| score}
16
+ [result, result]
11
17
  else
12
- counts_for_two_matrices_with_different_probabilities(threshold_first, threshold_second)
18
+ result = get_counts(threshold_first, threshold_second) {|score,letter| first_background[letter] * score }
19
+ [result, result]
13
20
  end
14
21
  end
15
22
 
16
- def counts_for_two_matrices_with_different_probabilities(threshold_first, threshold_second)
17
- scores = { 0 => {0 => [1,1]} } # scores_on_first_pwm, scores_on_second_pwm --> count_on_first_probabilities, count_on_second_probabilities
18
- result_first = 0.0
19
- result_second = 0.0
20
- length.times do |column|
21
- ending_weight_first = first.background_sum ** (length - column - 1)
22
- ending_weight_second = second.background_sum ** (length - column - 1)
23
- already_enough_first = threshold_first - first.worst_suffix[column + 1]
24
- already_enough_second = threshold_second - second.worst_suffix[column + 1]
25
- least_sufficient_first = threshold_first - first.best_suffix[column + 1]
26
- least_sufficient_second = threshold_second - second.best_suffix[column + 1]
27
-
28
- new_scores = Hash.new{|h,k| h[k]=Hash.new{|h2,k2| h2[k2]=[0,0]}}
29
- scores.each do |score_first, second_scores|
30
- second_scores.each do |score_second, count|
31
- 4.times do |letter|
32
- new_score_first = score_first + first.matrix[column][letter]
33
- if new_score_first >= already_enough_first
34
- new_score_second = score_second + second.matrix[column][letter]
35
- if new_score_second >= already_enough_second
36
- result_first += count[0] * first.background[letter] * ending_weight_first
37
- result_second += count[1] * second.background[letter] * ending_weight_second
38
- elsif new_score_second >= least_sufficient_second
39
- new_scores[new_score_first][new_score_second][0] += count[0] * first.background[letter]
40
- new_scores[new_score_first][new_score_second][1] += count[1] * second.background[letter]
41
- end
42
- elsif new_score_first >= least_sufficient_first
43
- new_score_second = score_second + second.matrix[column][letter]
44
- if new_score_second >= least_sufficient_second
45
- new_scores[new_score_first][new_score_second][0] += count[0] * first.background[letter]
46
- new_scores[new_score_first][new_score_second][1] += count[1] * second.background[letter]
47
- end
48
- end
49
- end
50
- end
23
+
24
+ # block has form: {|score,letter| contribution to count by `letter` with `score` }
25
+ def get_counts(threshold_first, threshold_second, &count_contribution_block)
26
+ # scores_on_first_pwm, scores_on_second_pwm --> count
27
+ scores = { 0 => {0 => 1} }
28
+ length.times do |column|
29
+ new_scores = recalc_score_hash(scores,
30
+ @first.matrix[column], @second.matrix[column],
31
+ threshold_first - first.best_suffix(column + 1),
32
+ threshold_second - second.best_suffix(column + 1), &count_contribution_block)
33
+ scores.replace(new_scores)
34
+ if defined?(MaxHashSizeDouble) && scores.inject(0){|sum,hsh|sum + hsh.size} > MaxHashSizeDouble
35
+ raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
51
36
  end
52
- raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities' if new_scores.inject(0){|sum,hsh|sum+hsh.size} > MaxHashSizeDouble
53
- scores = new_scores
54
37
  end
55
- [result_first, result_second]
38
+ scores.inject(0.0){|sum,(score_first, hsh)| sum + hsh.inject(0.0){|sum,(score_second, count)| sum + count }}
56
39
  end
57
-
58
- def counts_for_two_matrices_with_same_probabilities(threshold_first, threshold_second)
59
- scores = { 0 => {0 => 1} } # scores_on_first_pwm, scores_on_second_pwm --> count_on_first_probabilities, count_on_second_probabilities
60
- result = 0.0
61
- background = first.background
62
- length.times do |column|
63
- ending_weight = first.background_sum ** (length - column - 1)
64
- already_enough_first = threshold_first - first.worst_suffix[column + 1]
65
- already_enough_second = threshold_second - second.worst_suffix[column + 1]
66
- least_sufficient_first = threshold_first - first.best_suffix[column + 1]
67
- least_sufficient_second = threshold_second - second.best_suffix[column + 1]
68
40
 
69
- new_scores = Hash.new{|h,k| h[k]=Hash.new{|h2,k2| h2[k2]=0} }
70
- scores.each do |score_first, second_scores|
71
- second_scores.each do |score_second, count|
72
- 4.times do |letter|
73
- new_score_first = score_first + first.matrix[column][letter]
74
- if new_score_first >= already_enough_first
75
- new_score_second = score_second + second.matrix[column][letter]
76
- if new_score_second >= already_enough_second
77
- result += count * background[letter] * ending_weight
78
- elsif new_score_second >= least_sufficient_second
79
- new_scores[new_score_first][new_score_second] += count * background[letter]
80
- end
81
- elsif new_score_first >= least_sufficient_first
82
- new_score_second = score_second + second.matrix[column][letter]
83
- if new_score_second >= least_sufficient_second
84
- new_scores[new_score_first][new_score_second] += count * background[letter]
85
- end
86
- end
87
- end
88
- end
89
- end
90
- raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_same_probabilities' if new_scores.inject(0){|sum,hsh|sum+hsh.size} > MaxHashSizeDouble
91
- scores = new_scores
92
- end
93
- [result, result]
94
- end
95
-
96
-
97
- def common_words_for_two_matrices(threshold_first, threshold_second)
98
- scores = { 0 => {0 => 1} } # scores_on_first_pwm, scores_on_second_pwm --> count_on_first_probabilities, count_on_second_probabilities
99
- result = 0
100
- length.times do |column|
101
- ending_weight = 4 ** (length - column - 1)
102
- already_enough_first = threshold_first - first.worst_suffix[column + 1]
103
- already_enough_second = threshold_second - second.worst_suffix[column + 1]
104
- least_sufficient_first = threshold_first - first.best_suffix[column + 1]
105
- least_sufficient_second = threshold_second - second.best_suffix[column + 1]
41
+ # wouldn't work without count_contribution_block
42
+ def recalc_score_hash(scores, first_column, second_column, least_sufficient_first, least_sufficient_second)
43
+ new_scores = Hash.new{|h,k| h[k] = Hash.new(0)}
44
+ scores.each do |score_first, second_scores|
45
+ second_scores.each do |score_second, count|
106
46
 
107
- new_scores = Hash.new{|h,k| h[k]=Hash.new{|h2,k2| h2[k2]=0} }
108
- scores.each do |score_first, second_scores|
109
- second_scores.each do |score_second, count|
110
- 4.times do |letter|
111
- new_score_first = score_first + first.matrix[column][letter]
112
- if new_score_first >= already_enough_first
113
- new_score_second = score_second + second.matrix[column][letter]
114
- if new_score_second >= already_enough_second
115
- result += count * ending_weight
116
- elsif new_score_second >= least_sufficient_second
117
- new_scores[new_score_first][new_score_second] += count
118
- end
119
- elsif new_score_first >= least_sufficient_first
120
- new_score_second = score_second + second.matrix[column][letter]
121
- if new_score_second >= least_sufficient_second
122
- new_scores[new_score_first][new_score_second] += count
123
- end
47
+ 4.times do |letter|
48
+ new_score_first = score_first + first_column[letter]
49
+ if new_score_first >= least_sufficient_first
50
+ new_score_second = score_second + second_column[letter]
51
+ if new_score_second >= least_sufficient_second
52
+ new_scores[new_score_first][new_score_second] += yield(count, letter)
124
53
  end
125
54
  end
126
55
  end
56
+
127
57
  end
128
-
129
- raise 'Hash overflow in Macroape::AlignedPairIntersection#common_words_for_two_matrices' if defined? MaxHashSizeDouble and new_scores.inject(0){|sum,hsh|sum+hsh.size} > MaxHashSizeDouble
130
- scores = new_scores
131
58
  end
132
- [result, result]
59
+ new_scores
133
60
  end
134
-
61
+
135
62
  end
136
63
  end
@@ -11,10 +11,10 @@ module Macroape
11
11
  @infos[pwm.name] = info
12
12
  end
13
13
  def ==(other)
14
- @rough_discretization == other.rough_discretization &&
15
- @precise_discretization == other.precise_discretization &&
16
- @background == other.background &&
17
- @pvalues == other.pvalues &&
14
+ @rough_discretization == other.rough_discretization &&
15
+ @precise_discretization == other.precise_discretization &&
16
+ @background == other.background &&
17
+ @pvalues == other.pvalues &&
18
18
  @pwms == other.pwms &&
19
19
  @infos == other.infos
20
20
  end
@@ -3,7 +3,7 @@ module Bioinform
3
3
  def threshold(pvalue)
4
4
  thresholds(pvalue){|_, thresh, _| return thresh }
5
5
  end
6
-
6
+
7
7
  def thresholds(*pvalues)
8
8
  thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
9
9
  threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
@@ -11,31 +11,30 @@ module Bioinform
11
11
  yield pvalue, threshold, real_pvalue
12
12
  end
13
13
  end
14
-
14
+
15
15
  def count_distribution_under_pvalue(max_pvalue)
16
- count_distribution={}
16
+ cnt_distribution = {}
17
17
  look_for_count = max_pvalue * vocabulary_volume
18
- until count_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
19
- count_distribution = count_distribution_after_threshold(threshold_gauss_estimation(max_pvalue))
18
+ until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
19
+ cnt_distribution = count_distribution_after_threshold(threshold_gauss_estimation(max_pvalue))
20
20
  max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
21
21
  end
22
-
23
- count_distribution
22
+
23
+ cnt_distribution
24
24
  end
25
-
26
-
25
+
26
+
27
27
  # ret-value: hash {pvalue => [thresholds, counts]}
28
28
  # thresholds = left_threshold .. right_threshold (left_threshold < right_threshold)
29
29
  # counts = left_count .. right_count (left_count > right_count)
30
30
  def thresholds_by_pvalues(*pvalues)
31
- count_distribution = count_distribution_under_pvalue(pvalues.max)
32
- sorted_scores = count_distribution.sort.reverse
31
+ sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
33
32
  scores = sorted_scores.map{|score,count| score}
34
33
  counts = sorted_scores.map{|score,count| count}
35
34
  partial_sums = counts.partial_sums
36
-
35
+
37
36
  results = {}
38
-
37
+
39
38
  pvalue_counts = pvalues.sort.collect_hash{|pvalue| [pvalue, pvalue * vocabulary_volume] }
40
39
  pvalue_counts.map do |pvalue,look_for_count|
41
40
  ind = partial_sums.index{|sum| sum >= look_for_count}
@@ -46,18 +45,19 @@ module Bioinform
46
45
 
47
46
  results
48
47
  end
49
-
48
+
50
49
  def count_distribution_after_threshold(threshold)
50
+ return @count_distribution.select{|score, count| score >= threshold} if @count_distribution
51
51
  scores = { 0 => 1 }
52
52
  length.times do |column|
53
- scores.replace recalc_score_hash(scores, @matrix[column], threshold - best_suffix[column + 1])
53
+ scores.replace recalc_score_hash(scores, @matrix[column], threshold - best_suffix(column + 1))
54
54
  raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold' if defined? MaxHashSizeSingle and scores.size > MaxHashSizeSingle
55
55
  end
56
56
  scores
57
57
  end
58
-
58
+
59
59
  def count_distribution
60
- count_distribution_after_threshold(worst_score)
60
+ @count_distribution ||= count_distribution_after_threshold(worst_score)
61
61
  end
62
62
 
63
63
  def recalc_score_hash(scores, column, least_sufficient)
@@ -72,6 +72,16 @@ module Bioinform
72
72
  end
73
73
  new_scores
74
74
  end
75
-
75
+
76
+ def counts_by_thresholds(*thresholds)
77
+ scores = count_distribution_after_threshold(thresholds.min)
78
+ thresholds.map{ |threshold|
79
+ scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
80
+ }
81
+ end
82
+
83
+ def pvalue_by_threshold(threshold)
84
+ counts_by_thresholds(threshold).first / vocabulary_volume
85
+ end
76
86
  end
77
87
  end