RubyGems - macroape - Versions diffs - 4.0.2 → 4.1.0 - Mend

macroape 4.0.2 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

checksums.yaml +4 -4
data/.gitignore +17 -17
data/Gemfile +4 -4
data/LICENSE +22 -22
data/README.md +70 -70
data/Rakefile.rb +49 -49
data/TODO.txt +46 -46
data/benchmark/benchmark_helper.rb +4 -4
data/benchmark/similarity_benchmark.rb +52 -52
data/bin/align_motifs +4 -4
data/bin/eval_alignment +4 -4
data/bin/eval_similarity +4 -4
data/bin/find_pvalue +4 -4
data/bin/find_threshold +4 -4
data/bin/preprocess_collection +4 -4
data/bin/scan_collection +4 -4
data/lib/macroape.rb +14 -11
data/lib/macroape/aligned_pair_intersection.rb +61 -62
data/lib/macroape/cli.rb +191 -188
data/lib/macroape/cli/align_motifs.rb +120 -100
data/lib/macroape/cli/eval_alignment.rb +157 -156
data/lib/macroape/cli/eval_similarity.rb +138 -137
data/lib/macroape/cli/find_pvalue.rb +93 -87
data/lib/macroape/cli/find_threshold.rb +103 -96
data/lib/macroape/cli/preprocess_collection.rb +169 -161
data/lib/macroape/cli/scan_collection.rb +171 -163
data/lib/macroape/collection.rb +29 -0
data/lib/macroape/motif_with_thresholds.rb +18 -0
data/lib/macroape/pwm_compare.rb +39 -44
data/lib/macroape/pwm_compare_aligned.rb +139 -130
data/lib/macroape/{counting.rb → pwm_counting.rb} +175 -121
data/lib/macroape/support/inverf.rb +13 -0
data/lib/macroape/support/partial_sums.rb +17 -0
data/lib/macroape/version.rb +4 -4
data/macroape.gemspec +19 -19
data/spec/count_distribution_spec.rb +112 -109
data/spec/inverf_spec.rb +23 -0
data/spec/partial_sums_spec.rb +28 -0
data/spec/spec_helper.rb +11 -11
data/test/align_motifs_test.rb +42 -43
data/test/data/AHR_si.pwm +10 -10
data/test/data/KLF3_f1.pcm +16 -16
data/test/data/KLF3_f1.pwm +16 -16
data/test/data/KLF4_f2.pcm +11 -11
data/test/data/KLF4_f2.pwm +11 -11
data/test/data/KLF4_f2_scan_results_all.txt +2 -2
data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -1
data/test/data/KLF4_f2_scan_results_precise_mode.txt +2 -2
data/test/data/SP1_f1.pcm +12 -12
data/test/data/SP1_f1.pwm +12 -12
data/test/data/SP1_f1_revcomp.pcm +12 -12
data/test/data/SP1_f1_revcomp.pwm +12 -12
data/test/data/medium_motif.pwm +8 -8
data/test/data/short_motif.pwm +7 -7
data/test/data/test_collection.yaml +231 -214
data/test/data/test_collection/GABPA_f1.pwm +14 -14
data/test/data/test_collection/KLF4_f2.pwm +10 -10
data/test/data/test_collection/SP1_f1.pwm +12 -12
data/test/data/test_collection_pcm/GABPA_f1.pcm +14 -14
data/test/data/test_collection_pcm/KLF4_f2.pcm +11 -11
data/test/data/test_collection_pcm/SP1_f1.pcm +12 -12
data/test/data/test_collection_single_file.txt +38 -38
data/test/data/test_collection_single_file_pcm.txt +37 -37
data/test/data/test_collection_weak.yaml +231 -214
data/test/eval_alignment_test.rb +90 -111
data/test/eval_similarity_test.rb +105 -123
data/test/find_pvalue_test.rb +34 -39
data/test/find_threshold_test.rb +87 -91
data/test/preprocess_collection_test.rb +56 -65
data/test/scan_collection_test.rb +42 -48
data/test/test_helper.rb +159 -160
metadata +14 -10
data/test/data/collection_pcm_without_thresholds.yaml +0 -188
data/test/data/collection_without_thresholds.yaml +0 -188

data/lib/macroape/pwm_compare_aligned.rb CHANGED

@@ -1,130 +1,139 @@
-require 'bioinform/support/parameters'
-require_relative 'aligned_pair_intersection'
-module Macroape
-  class PWMCompareAligned
-    include Bioinform::Parameters
-    # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
-    make_parameters :max_pair_hash_size
-    attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length, :parameters
-    def initialize(first_unaligned, second_unaligned, shift, orientation)
-      @parameters = OpenStruct.new
-      @shift, @orientation = shift, orientation
-      @first_length, @second_length = first_unaligned.length, second_unaligned.length
-      @length = self.class.calculate_alignment_length(@first_length, @second_length, @shift)
-      first, second = first_unaligned, second_unaligned
-      second = second.reverse_complement  if revcomp?
-      if shift > 0
-        second = second.left_augment(shift)
-      else
-        first = first.left_augment(-shift)
-      end
-      @first = first.right_augment(@length - first.length)
-      @second = second.right_augment(@length - second.length)
-    end
-    def direct?
-      orientation == :direct
-    end
-    def revcomp?
-      orientation == :revcomp
-    end
-    def overlap
-      length.times.count{|pos| first_overlaps?(pos) && second_overlaps?(pos) }
-    end
-    def first_pwm_alignment
-      length.times.map do |pos|
-        if first_overlaps?(pos)
-          '>'
-        else
-          '.'
-        end
-      end.join
-    end
-    def second_pwm_alignment
-      length.times.map do |pos|
-        if second_overlaps?(pos)
-          direct? ? '>' : '<'
-        else
-          '.'
-        end
-      end.join
-    end
-    def alignment_infos
-      {shift: shift,
-      orientation: orientation,
-      text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
-      overlap: overlap,
-      alignment_length: length}
-    end
-    # whether first matrix overlap specified position of alignment
-    def first_overlaps?(pos)
-      return false unless pos >= 0 && pos < length
-      if shift > 0
-        pos < first_length
-      else
-        pos >= -shift && pos < -shift + first_length
-      end
-    end
-    def second_overlaps?(pos)
-      return false unless pos >= 0 && pos < length
-      if shift > 0
-        pos >= shift && pos < shift + second_length
-      else
-        pos < second_length
-      end
-    end
-    def jaccard(first_threshold, second_threshold)
-      f = first.count_by_threshold(first_threshold)
-      s = second.count_by_threshold(second_threshold)
-      if f == 0 || s == 0
-        return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
-              recognized_by_first: f,
-              recognized_by_second: s,
-            }
-      end
-      intersect = counts_for_two_matrices(first_threshold, second_threshold)
-      intersect = Math.sqrt(intersect[0] * intersect[1])
-      union = f + s - intersect
-      similarity = intersect.to_f / union
-      { similarity: similarity,  tanimoto: 1.0 - similarity,  recognized_by_both: intersect,
-        recognized_by_first: f,  recognized_by_second: s,
-        real_pvalue_first: f / first.vocabulary_volume, real_pvalue_second: s / second.vocabulary_volume }
-    end
-    def jaccard_by_pvalue(pvalue)
-      threshold_first = first.threshold(pvalue)
-      threshold_second = second.threshold(pvalue)
-      jaccard(threshold_first, threshold_second)
-    end
-    def jaccard_by_weak_pvalue(pvalue)
-      threshold_first = first.weak_threshold(pvalue)
-      threshold_second = second.weak_threshold(pvalue)
-      jaccard(threshold_first, threshold_second)
-    end
-    def self.calculate_alignment_length(first_len, second_len, shift)
-      if shift > 0
-        [first_len, second_len + shift].max
-      else
-        [first_len - shift, second_len].max
-      end
-    end
-  end
-end
+require_relative 'aligned_pair_intersection'
+module Macroape
+  class PWMCounting # Reopened here to add alignment helpers; each one wraps a transformed PWM in a new PWMCounting.
+    def left_augmented(n) # New PWMCounting around pwm.left_augmented(n); background and max_hash_size are carried over.
+      PWMCounting.new(pwm.left_augmented(n), background: background, max_hash_size: max_hash_size)
+    end
+    def right_augmented(n) # Same as left_augmented, but delegates to pwm.right_augmented(n).
+      PWMCounting.new(pwm.right_augmented(n), background: background, max_hash_size: max_hash_size)
+    end
+    def reverse_complemented # New PWMCounting around pwm.reverse_complemented, same background and max_hash_size.
+      PWMCounting.new(pwm.reverse_complemented, background: background, max_hash_size: max_hash_size)
+    end
+  end # NOTE(review): this file only requires 'aligned_pair_intersection'; presumably pwm_counting.rb is loaded elsewhere before these helpers run - confirm.
+  class PWMCompareAligned # Two motifs fixed at a given shift and orientation, both augmented to a common alignment length.
+    # Limit on the combined size of the calculation hashes; a guard against excessive CPU use on unsuitable input.
+    attr_accessor :max_pair_hash_size # NOTE(review): not read anywhere in this class body as shown; presumably used by aligned_pair_intersection.rb - confirm.
+    attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length
+    # first_unaligned and second_unaligned are PWMCounting objects, not PWMs.
+    def initialize(first_unaligned, second_unaligned, shift, orientation) # orientation is compared against :direct / :revcomp by direct? / revcomp?
+      @shift, @orientation = shift, orientation
+      @first_length, @second_length = first_unaligned.length, second_unaligned.length # lengths as given, before any augmentation
+      @length = self.class.calculate_alignment_length(@first_length, @second_length, @shift)
+      first, second = first_unaligned, second_unaligned # locals; they shadow the attr_readers for the rest of this method
+      second = second.reverse_complemented  if revcomp?
+      if shift > 0 # positive shift left-augments the second motif by shift; otherwise the first is left-augmented by -shift
+        second = second.left_augmented(shift)
+      else
+        first = first.left_augmented(-shift)
+      end
+      @first = first.right_augmented(@length - first.length) # first is the local here, i.e. after any left_augmented call above
+      @second = second.right_augmented(@length - second.length)
+    end
+    def direct? # true when orientation is :direct
+      orientation == :direct
+    end
+    def revcomp? # true when orientation is :revcomp
+      orientation == :revcomp
+    end
+    def overlap # number of alignment positions covered by both motifs
+      length.times.count{|pos| first_overlaps?(pos) && second_overlaps?(pos) }
+    end
+    def first_pwm_alignment # one character per alignment position: '>' where the first motif lies, '.' elsewhere
+      length.times.map do |pos|
+        if first_overlaps?(pos)
+          '>'
+        else
+          '.'
+        end
+      end.join
+    end
+    def second_pwm_alignment # same picture for the second motif; '<' is used instead of '>' when not direct
+      length.times.map do |pos|
+        if second_overlaps?(pos)
+          direct? ? '>' : '<'
+        else
+          '.'
+        end
+      end.join
+    end
+    def alignment_infos # summary hash: shift, orientation, two-line text picture, overlap and alignment_length
+      {shift: shift,
+      orientation: orientation,
+      text: "#{first_pwm_alignment}\n#{second_pwm_alignment}",
+      overlap: overlap,
+      alignment_length: length}
+    end
+    # Whether the first matrix covers position pos of the alignment.
+    def first_overlaps?(pos)
+      return false unless pos >= 0 && pos < length # positions outside the alignment never overlap
+      if shift > 0
+        pos < first_length # first motif starts at position 0
+      else
+        pos >= -shift && pos < -shift + first_length # first motif starts at position -shift
+      end
+    end
+    def second_overlaps?(pos) # whether the second matrix covers position pos of the alignment
+      return false unless pos >= 0 && pos < length
+      if shift > 0
+        pos >= shift && pos < shift + second_length # second motif starts at position shift
+      else
+        pos < second_length # second motif starts at position 0
+      end
+    end
+    def jaccard(first_threshold, second_threshold) # similarity hash for the two aligned motifs at the given score thresholds
+      f = first.count_by_threshold(first_threshold)
+      s = second.count_by_threshold(second_threshold)
+      if f == 0 || s == 0 # nothing passes one of the thresholds: return -1 sentinels (this hash has no real_pvalue_* keys)
+        return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
+              recognized_by_first: f,
+              recognized_by_second: s,
+            }
+      end
+      intersect = counts_for_two_matrices(first_threshold, second_threshold) # not defined in this file; presumably from aligned_pair_intersection.rb - confirm
+      intersect = Math.sqrt(intersect[0] * intersect[1]) # geometric mean of the two returned counts
+      union = f + s - intersect
+      similarity = intersect.to_f / union # :tanimoto below is reported as 1.0 - similarity
+      { similarity: similarity,  tanimoto: 1.0 - similarity,  recognized_by_both: intersect,
+        recognized_by_first: f,  recognized_by_second: s,
+        real_pvalue_first: f / first.vocabulary_volume, real_pvalue_second: s / second.vocabulary_volume }
+    end
+    def jaccard_by_pvalue(pvalue) # jaccard at the thresholds that #threshold gives each motif for pvalue
+      threshold_first = first.threshold(pvalue)
+      threshold_second = second.threshold(pvalue)
+      jaccard(threshold_first, threshold_second)
+    end
+    def jaccard_by_weak_pvalue(pvalue) # same as jaccard_by_pvalue, but using #weak_threshold
+      threshold_first = first.weak_threshold(pvalue)
+      threshold_second = second.weak_threshold(pvalue)
+      jaccard(threshold_first, threshold_second)
+    end
+    def self.calculate_alignment_length(first_len, second_len, shift) # total alignment length for motifs of these lengths at this shift
+      if shift > 0
+        [first_len, second_len + shift].max # second motif is offset by shift
+      else
+        [first_len - shift, second_len].max # first motif is offset by -shift
+      end
+    end
+  end
+end

data/lib/macroape/{counting.rb → pwm_counting.rb} RENAMED

@@ -1,121 +1,175 @@
-require 'bioinform'
-module Bioinform
-  class PWM
-    # sets or gets limit size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
-    make_parameters :max_hash_size
-    def threshold(pvalue)
-      thresholds(pvalue){|_, thresh, _| return thresh }
-    end
-    def threshold_and_real_pvalue(pvalue)
-      thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
-    end
-    def weak_threshold(pvalue)
-      weak_thresholds(pvalue){|_, thresh, _| return thresh }
-    end
-    def weak_threshold_and_real_pvalue(pvalue)
-      weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
-    end
-    def thresholds(*pvalues)
-      thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
-        threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
-        real_pvalue = counts.end.to_f / vocabulary_volume
-        yield pvalue, threshold, real_pvalue
-      end
-    end
-    # "weak" means that threshold has real pvalue not less than given pvalue, while usual threshold not greater
-    def weak_thresholds(*pvalues)
-      thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
-        threshold = thresholds.begin.to_f
-        real_pvalue = counts.begin.to_f / vocabulary_volume
-        yield pvalue, threshold, real_pvalue
-      end
-    end
-    def count_distribution_under_pvalue(max_pvalue)
-      cnt_distribution = {}
-      look_for_count = max_pvalue * vocabulary_volume
-      until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
-        begin
-          approximate_threshold = threshold_gauss_estimation(max_pvalue)
-        rescue
-          approximate_threshold = worst_score
-        end
-        cnt_distribution = count_distribution_after_threshold(approximate_threshold)
-        max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
-      end
-      cnt_distribution
-    end
-    # ret-value: hash {pvalue => [thresholds, counts]}
-    # thresholds = left_threshold .. right_threshold  (left_threshold < right_threshold)
-    # counts = left_count .. right_count  (left_count > right_count)
-    def thresholds_by_pvalues(*pvalues)
-      sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
-      scores = sorted_scores.map{|score,count| score}
-      counts = sorted_scores.map{|score,count| count}
-      partial_sums = counts.partial_sums
-      results = {}
-      pvalue_counts = pvalues.sort.collect_hash{|pvalue| [pvalue, pvalue * vocabulary_volume] }
-      pvalue_counts.map do |pvalue,look_for_count|
-        ind = partial_sums.index{|sum| sum >= look_for_count}
-        minscore, count_at_minscore = scores[ind], partial_sums[ind]
-        maxscore, count_at_maxscore = ind > 0  ?  [ scores[ind-1],  partial_sums[ind-1] ]  :  [ best_score + 1.0, 0.0 ]
-        results[pvalue] = [(minscore .. maxscore), (count_at_minscore .. count_at_maxscore)]
-      end
-      results
-    end
-    def count_distribution_after_threshold(threshold)
-      return @count_distribution.select{|score, count| score >= threshold}  if @count_distribution
-      scores = { 0 => 1 }
-      length.times do |column|
-        scores.replace recalc_score_hash(scores, @matrix[column], threshold - best_suffix(column + 1))
-        raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold'  if max_hash_size && scores.size > max_hash_size
-      end
-      scores
-    end
-    def count_distribution
-      @count_distribution ||= count_distribution_after_threshold(worst_score)
-    end
-    def recalc_score_hash(scores, column, least_sufficient)
-      new_scores = Hash.new(0)
-      scores.each do |score, count|
-        4.times do |letter|
-          new_score = score + column[letter]
-          if new_score >= least_sufficient
-            new_scores[new_score] += count * background[letter]
-          end
-        end
-      end
-      new_scores
-    end
-    def counts_by_thresholds(*thresholds)
-      scores = count_distribution_after_threshold(thresholds.min)
-      thresholds.inject({}){ |hsh, threshold|
-        hsh[threshold] = scores.inject(0.0){|sum,(score,count)|  (score >= threshold) ? sum + count : sum}
-        hsh
-      }
-    end
-    def count_by_threshold(threshold)
-      counts_by_thresholds(threshold)[threshold]
-    end
-    def pvalue_by_threshold(threshold)
-      count_by_threshold(threshold) / vocabulary_volume
-    end
-  end
-end
+require 'bioinform'
+require_relative 'support/inverf'
+require_relative 'support/partial_sums'
+module Macroape
+  class PWMCounting # Wraps a PWM with a background model; computes score distributions, counts and P-value thresholds for it.
+    attr_accessor :pwm, :max_hash_size, :background
+    def initialize(pwm, background: Bioinform::Background::Wordwise, max_hash_size: nil) # a nil max_hash_size disables the overflow check in count_distribution_after_threshold
+      @pwm = pwm
+      @background = background
+      @max_hash_size = max_hash_size
+    end
+    def matrix # delegates to pwm.matrix
+      pwm.matrix
+    end
+    def vocabulary_volume # background.volume raised to the motif length
+      background.volume ** length
+    end
+    def threshold_gauss_estimation(max_pvalue) # NOTE(review): redefined further down in this class; the later definition replaces this one, so this delegating version is dead code.
+      pwm.threshold_gauss_estimation(max_pvalue)
+    end
+    def length # delegates to pwm.length
+      pwm.length
+    end
+    def best_score # best_suffix(0): sum of per-column maxima over the whole matrix
+      best_suffix(0)
+    end
+    def worst_score # worst_suffix(0): sum of per-column minima over the whole matrix
+      worst_suffix(0)
+    end
+    # Best score of the suffix starting at column i: sum of the column maxima from i to the end.
+    def best_suffix(i)
+      matrix[i...length].map(&:max).inject(0.0, &:+)
+    end
+    def worst_suffix(i) # worst score of the suffix starting at column i: sum of the column minima
+      matrix[i...length].map(&:min).inject(0.0, &:+)
+    end
+    def score_mean # sum over pwm positions of background.mean(position)
+      pwm.each_position.inject(0.0){|mean, position| mean + background.mean(position) }
+    end
+    def score_variance # sum over pwm positions of background.mean_square(position) - background.mean(position)**2
+      pwm.each_position.inject(0.0){|variance, position| variance + background.mean_square(position) - background.mean(position) **2 }
+    end
+    def threshold_gauss_estimation(pvalue) # threshold estimate score_mean + n_ * sigma (normal approximation of the score distribution)
+      sigma = Math.sqrt(score_variance)
+      n_ = Math.inverf(1 - 2 * pvalue) * Math.sqrt(2) # Math.inverf is not part of core Ruby; presumably added by support/inverf - confirm
+      score_mean + n_ * sigma
+    end
+    def threshold(pvalue) # threshold for a single pvalue
+      thresholds(pvalue){|_, thresh, _| return thresh } # `return` inside the block exits this method on the first yield
+    end
+    def threshold_and_real_pvalue(pvalue) # [threshold, real_pvalue] for a single pvalue
+      thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
+    end
+    def weak_threshold(pvalue) # weak threshold for a single pvalue
+      weak_thresholds(pvalue){|_, thresh, _| return thresh }
+    end
+    def weak_threshold_and_real_pvalue(pvalue) # [weak threshold, real_pvalue] for a single pvalue
+      weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
+    end
+    def thresholds(*pvalues) # yields (pvalue, threshold, real_pvalue) for each requested pvalue; a block is required
+      thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
+        threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin) # a point 10% of the way into the score range
+        real_pvalue = counts.end.to_f / vocabulary_volume
+        yield pvalue, threshold, real_pvalue
+      end
+    end
+    # "Weak": the threshold's real pvalue is not less than the requested pvalue, whereas the usual threshold's is not greater.
+    def weak_thresholds(*pvalues)
+      thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
+        threshold = thresholds.begin.to_f # lower end of the score range, unlike #thresholds
+        real_pvalue = counts.begin.to_f / vocabulary_volume
+        yield pvalue, threshold, real_pvalue
+      end
+    end
+    def count_distribution_under_pvalue(max_pvalue) # score => count hash whose counts sum to at least max_pvalue * vocabulary_volume
+      cnt_distribution = {}
+      look_for_count = max_pvalue * vocabulary_volume # fixed target; max_pvalue itself is doubled inside the loop
+      until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count # NOTE(review): no iteration bound; never terminates if the target count cannot be reached
+        begin
+          approximate_threshold = threshold_gauss_estimation(max_pvalue)
+        rescue # NOTE(review): bare rescue; any StandardError from the estimate falls back to worst_score
+          approximate_threshold = worst_score
+        end
+        cnt_distribution = count_distribution_after_threshold(approximate_threshold)
+        max_pvalue *=2 # double the pvalue so that, if too few words were counted, the next pass uses a lower threshold estimate
+      end
+      cnt_distribution
+    end
+    # Returns a hash {pvalue => [thresholds, counts]}.
+    # thresholds = left_threshold .. right_threshold  (left_threshold < right_threshold)
+    # counts = left_count .. right_count  (left_count > right_count)
+    def thresholds_by_pvalues(*pvalues)
+      sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse # [score, count] pairs, highest score first
+      scores = sorted_scores.map{|score,count| score}
+      counts = sorted_scores.map{|score,count| count}
+      partial_sums = counts.partial_sums # Array#partial_sums is not core Ruby; presumably added by support/partial_sums - confirm
+      results = {}
+      pvalue_counts = pvalues.sort.each_with_object({}){|pvalue, hsh| hsh[pvalue] = pvalue * vocabulary_volume }
+      pvalue_counts.map do |pvalue,look_for_count| # NOTE(review): map is used only for its side effect on results
+        ind = partial_sums.index{|sum| sum >= look_for_count} # first index (highest score) where the running count reaches the target
+        minscore, count_at_minscore = scores[ind], partial_sums[ind]
+        maxscore, count_at_maxscore = ind > 0  ?  [ scores[ind-1],  partial_sums[ind-1] ]  :  [ best_score + 1.0, 0.0 ] # at ind == 0 there is no higher score: use best_score + 1.0 with count 0.0
+        results[pvalue] = [(minscore .. maxscore), (count_at_minscore .. count_at_maxscore)]
+      end
+      results
+    end
+    def count_distribution_after_threshold(threshold) # score => weighted count hash, restricted to scores >= threshold
+      return @count_distribution.select{|score, count| score >= threshold}  if @count_distribution
+      scores = { 0 => 1 }
+      length.times do |column| # extend the partial scores one column at a time
+        scores.replace recalc_score_hash(scores, matrix[column], threshold - best_suffix(column + 1))
+        raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold'  if max_hash_size && scores.size > max_hash_size
+      end
+      scores
+    end
+    def count_distribution # memoized distribution over all scores (threshold = worst_score)
+      @count_distribution ||= count_distribution_after_threshold(worst_score)
+    end
+    def recalc_score_hash(scores, column, least_sufficient) # extends every partial score by one column, keeping sums >= least_sufficient
+      new_scores = Hash.new(0)
+      scores.each do |score, count|
+        4.times do |letter| # NOTE(review): alphabet size is hard-coded to 4 letters
+          new_score = score + column[letter]
+          if new_score >= least_sufficient
+            new_scores[new_score] += count * background.counts[letter] # weight by the background count of this letter
+          end
+        end
+      end
+      new_scores
+    end
+    def counts_by_thresholds(*thresholds) # hash threshold => total count of scores >= that threshold
+      scores = count_distribution_after_threshold(thresholds.min) # one distribution, cut at the lowest threshold, serves all of them
+      thresholds.inject({}){ |hsh, threshold|
+        hsh[threshold] = scores.inject(0.0){|sum,(score,count)|  (score >= threshold) ? sum + count : sum}
+        hsh
+      }
+    end
+    def count_by_threshold(threshold) # counts_by_thresholds for a single threshold
+      counts_by_thresholds(threshold)[threshold]
+    end
+    def pvalue_by_threshold(threshold) # count_by_threshold divided by vocabulary_volume
+      count_by_threshold(threshold) / vocabulary_volume
+    end
+  end
+end