macroape 3.3.2 → 3.3.3

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -15,4 +15,3 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
- TODO.txt
data/Rakefile.rb ADDED
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require 'rspec/core/rake_task'
4
+
5
+ namespace :spec do
6
+ task :find_threshold do
7
+ system("ruby -I ./test test/find_threshold_test.rb")
8
+ end
9
+ task :find_pvalue do
10
+ system("ruby -I ./test test/find_pvalue_test.rb")
11
+ end
12
+ task :eval_similarity do
13
+ system("ruby -I ./test test/eval_similarity_test.rb")
14
+ end
15
+ task :eval_alignment_similarity do
16
+ system("ruby -I ./test test/eval_alignment_similarity_test.rb")
17
+ end
18
+ task :preprocess_collection do
19
+ system("ruby -I ./test test/preprocess_collection_test.rb")
20
+ end
21
+ task :scan_collection do
22
+ system("ruby -I ./test test/scan_collection_test.rb")
23
+ end
24
+ task :tests => [:find_threshold, :find_pvalue, :eval_similarity,
25
+ :eval_alignment_similarity, :scan_collection, :preprocess_collection]
26
+
27
+ RSpec::Core::RakeTask.new
28
+ end
29
+
30
+ desc 'Test all functionality of gem executables'
31
+ task :spec => ['spec:tests', 'spec:spec']
32
+
33
+ namespace :benchmark do
34
+ task :run do
35
+ require 'open3'
36
+ time = Time.now.strftime("%d-%m-%Y, %H:%M:%S sec")
37
+ File.open('benchmark/benchmark.log','a') do |f|
38
+ f.puts "=========================================================\n#{time}\n"
39
+ Dir.glob('benchmark/*_benchmark.rb') do |benchmark_filename|
40
+ Open3.popen3("ruby -I ./benchmark #{benchmark_filename}") do |inp, out, err, wait_thr|
41
+ benchmark_name = File.basename(benchmark_filename)
42
+ out_str = out.read
43
+ err_str = err.read
44
+
45
+ benchmark_infos = "-------------------\n#{benchmark_name}:\n#{out_str}\n"
46
+ benchmark_infos_to_file = benchmark_infos
47
+ puts benchmark_infos
48
+
49
+ if err_str && !err_str.empty?
50
+ STDERR.puts(err_str)
51
+ benchmark_infos_to_file = benchmark_infos + "\n!!!\nError:\n#{err_str}\n"
52
+ end
53
+
54
+ # add info about git commit (if everything is committed, otherwise to commit one should use special option -c)
55
+ f.puts benchmark_infos_to_file
56
+ end
57
+ end
58
+ end
59
+ end
60
+ task :show do
61
+ puts File.read('benchmark/benchmark.log')
62
+ end
63
+ end
64
+
65
+ task :benchmark => 'benchmark:run'
data/TODO.txt ADDED
@@ -0,0 +1,20 @@
1
+ Absolutely necessary:
2
+ Repair obtaining matrix not only from files but also from stdin
3
+ Make it available to load collections in preprocess_collection from a single file (and from stdin, of course)
4
+ Make it available to load PCM files (they should first be preprocessed to PWMs in a standardized way) -- maybe it's better to use a pipeline
5
+
6
+ Specs and tests:
7
+ create spec on use of MaxHashSize, MaxHashSizeDouble
8
+ create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
9
+ create test for getting PWMs from stdin
10
+ create test for nonuniform word-wise background([1,1,1,1]) and for different backgrounds
11
+
12
+ Ideas to increase performance:
13
+ - Add shifting matrix elements to zero after discretization - in such a case the worst suffix is zero at all positions
14
+ - (?) Make rearrangement of rows by DIC decreasing in aligned pair of matrices before counting
15
+ - Create JAVA extension for alignment_intersection methods in order to increase performance
16
+ - Possibly the algorithm shouldn't use a hash but instead have two iterations: first it determines the possible hash scores for every length of each pwm separately (if the worst suffix is always zero, it's a flat space of scores at all pwm prefix lengths). After that we can work with arrays which use such scores as indices via an additional substructure
17
+
18
+ Usability issues:
19
+ review Collection class. Now it's completely useless. Maybe it should even be in another gem (with blackjack and clustering)
20
+
@@ -0,0 +1,56 @@
1
+ require 'benchmark'
2
+
3
+ $:.unshift File.join(File.dirname(__FILE__),'../lib')
4
+ require 'macroape'
5
+
6
+ class TaskToBenchmark
7
+ def setup
8
+ @matrix_first = "KLF4_f2.xml
9
+ 0.30861857265872605 -2.254321000121579 0.13505703522674192 0.3285194224375633
10
+ -1.227018967707036 -4.814127713368663 1.3059890687390967 -4.908681463544344
11
+ -2.443469374521196 -4.648238485031404 1.3588686548279805 -4.441801801188402
12
+ -2.7177827948276123 -3.8073538975356565 1.356272809724262 -3.504104725510225
13
+ -0.5563232977367343 0.5340697765121405 -3.61417723090579 0.5270259776377405
14
+ -1.8687622060887386 -4.381483976582316 1.337932245336098 -3.815629658877517
15
+ -2.045671123823928 -2.384975142213679 0.7198551207724355 0.5449254135616948
16
+ -1.373157530374372 -3.0063112097748217 1.285188335493552 -2.5026044231773543
17
+ -2.1030513122772208 -1.8941348100402244 1.249265758393991 -1.4284210948906104
18
+ -1.3277128628152939 0.8982415633049462 -0.8080773665408135 -0.18161647647456935
19
+ "
20
+
21
+ @matrix_second = "> SP1_f1
22
+ -0.24435707885585334 -0.6748234046937317 0.8657012535789861 -1.1060188862599292
23
+ -1.0631255752097801 -2.1119259694238686 1.0960627561110399 -0.6138563775211981
24
+ -0.387227623476054 -2.973985191321805 1.1807800242010371 -4.338927525031567
25
+ -4.563896055436894 -2.916163300253228 1.3684371349982631 -5.077972423609655
26
+ -2.2369752892820087 -3.719643631330185 1.3510439136452728 -4.8899306705082335
27
+ -0.07473964149330914 0.9449196547620103 -2.624685764808605 -0.851098348782244
28
+ -1.9643526491643326 -2.9784027708801153 1.3113096718240569 -2.3243342594990253
29
+ -4.015548413965584 -3.138426807809667 1.338748858978805 -2.0846739035376483
30
+ -0.4450938582835542 -2.2510053061629707 1.126543157436868 -1.7780413702431377
31
+ -1.1896356092245055 -1.2251832285630033 1.163676006374752 -1.6080243648157357
32
+ -0.5166047365590577 0.7641033353626651 -0.28626775700282125 -0.6825482097865606"
33
+
34
+ @pvalue = 0.0005
35
+ @discretization = 10
36
+ @first_background, @second_background = [1,1,1,1], [1,1,1,1]
37
+
38
+ @pwm_first = Bioinform::PWM.new(@matrix_first).background(@first_background).discrete(@discretization)
39
+ @pwm_second = Bioinform::PWM.new(@matrix_second).background(@second_background).discrete(@discretization)
40
+ @cmp = Macroape::PWMCompare.new(@pwm_first, @pwm_second)
41
+ self
42
+ end
43
+
44
+ def run
45
+ first_threshold = @pwm_first.threshold(@pvalue)
46
+ second_threshold = @pwm_second.threshold(@pvalue)
47
+ info = @cmp.jaccard(first_threshold, second_threshold)
48
+ end
49
+ end
50
+
51
+ benchmark_result = 10.times.collect do
52
+ task_to_benchmark = TaskToBenchmark.new.setup
53
+ Benchmark.measure{ task_to_benchmark.run }
54
+ end.inject(&:+)
55
+
56
+ puts benchmark_result
data/lib/macroape.rb CHANGED
@@ -1,8 +1,7 @@
1
1
  require 'macroape/version'
2
2
 
3
3
  require 'bioinform'
4
- require 'macroape/threshold_by_pvalue'
5
- require 'macroape/count_by_threshold'
4
+ require 'macroape/counting'
6
5
 
7
6
  require 'macroape/aligned_pair_intersection'
8
7
  require 'macroape/pwm_compare_aligned'
@@ -1,136 +1,63 @@
1
1
  module Macroape
2
2
  class PWMCompareAligned
3
-
3
+
4
+ # unoptimized version of this and related methods
4
5
  def counts_for_two_matrices(threshold_first, threshold_second)
5
- if first.background == second.background
6
- if first.background == [1,1,1,1]
7
- common_words_for_two_matrices(threshold_first, threshold_second)
8
- else
9
- counts_for_two_matrices_with_same_probabilities(threshold_first, threshold_second)
10
- end
6
+ # just not to call method each time
7
+ first_background = first.background
8
+ second_background = second.background
9
+ unless first_background == second_background
10
+ first_result = get_counts(threshold_first, threshold_second) {|score,letter| first_background[letter] * score }
11
+ second_result = get_counts(threshold_first, threshold_second) {|score,letter| second_background[letter] * score }
12
+ return [first_result, second_result]
13
+ end
14
+ if first.background == [1,1,1,1]
15
+ result = get_counts(threshold_first, threshold_second) {|score,letter| score}
16
+ [result, result]
11
17
  else
12
- counts_for_two_matrices_with_different_probabilities(threshold_first, threshold_second)
18
+ result = get_counts(threshold_first, threshold_second) {|score,letter| first_background[letter] * score }
19
+ [result, result]
13
20
  end
14
21
  end
15
22
 
16
- def counts_for_two_matrices_with_different_probabilities(threshold_first, threshold_second)
17
- scores = { 0 => {0 => [1,1]} } # scores_on_first_pwm, scores_on_second_pwm --> count_on_first_probabilities, count_on_second_probabilities
18
- result_first = 0.0
19
- result_second = 0.0
20
- length.times do |column|
21
- ending_weight_first = first.background_sum ** (length - column - 1)
22
- ending_weight_second = second.background_sum ** (length - column - 1)
23
- already_enough_first = threshold_first - first.worst_suffix[column + 1]
24
- already_enough_second = threshold_second - second.worst_suffix[column + 1]
25
- least_sufficient_first = threshold_first - first.best_suffix[column + 1]
26
- least_sufficient_second = threshold_second - second.best_suffix[column + 1]
27
-
28
- new_scores = Hash.new{|h,k| h[k]=Hash.new{|h2,k2| h2[k2]=[0,0]}}
29
- scores.each do |score_first, second_scores|
30
- second_scores.each do |score_second, count|
31
- 4.times do |letter|
32
- new_score_first = score_first + first.matrix[column][letter]
33
- if new_score_first >= already_enough_first
34
- new_score_second = score_second + second.matrix[column][letter]
35
- if new_score_second >= already_enough_second
36
- result_first += count[0] * first.background[letter] * ending_weight_first
37
- result_second += count[1] * second.background[letter] * ending_weight_second
38
- elsif new_score_second >= least_sufficient_second
39
- new_scores[new_score_first][new_score_second][0] += count[0] * first.background[letter]
40
- new_scores[new_score_first][new_score_second][1] += count[1] * second.background[letter]
41
- end
42
- elsif new_score_first >= least_sufficient_first
43
- new_score_second = score_second + second.matrix[column][letter]
44
- if new_score_second >= least_sufficient_second
45
- new_scores[new_score_first][new_score_second][0] += count[0] * first.background[letter]
46
- new_scores[new_score_first][new_score_second][1] += count[1] * second.background[letter]
47
- end
48
- end
49
- end
50
- end
23
+
24
+ # block has form: {|score,letter| contribution to count by `letter` with `score` }
25
+ def get_counts(threshold_first, threshold_second, &count_contribution_block)
26
+ # scores_on_first_pwm, scores_on_second_pwm --> count
27
+ scores = { 0 => {0 => 1} }
28
+ length.times do |column|
29
+ new_scores = recalc_score_hash(scores,
30
+ @first.matrix[column], @second.matrix[column],
31
+ threshold_first - first.best_suffix(column + 1),
32
+ threshold_second - second.best_suffix(column + 1), &count_contribution_block)
33
+ scores.replace(new_scores)
34
+ if defined?(MaxHashSizeDouble) && scores.inject(0){|sum,hsh|sum + hsh.size} > MaxHashSizeDouble
35
+ raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
51
36
  end
52
- raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities' if new_scores.inject(0){|sum,hsh|sum+hsh.size} > MaxHashSizeDouble
53
- scores = new_scores
54
37
  end
55
- [result_first, result_second]
38
+ scores.inject(0.0){|sum,(score_first, hsh)| sum + hsh.inject(0.0){|sum,(score_second, count)| sum + count }}
56
39
  end
57
-
58
- def counts_for_two_matrices_with_same_probabilities(threshold_first, threshold_second)
59
- scores = { 0 => {0 => 1} } # scores_on_first_pwm, scores_on_second_pwm --> count_on_first_probabilities, count_on_second_probabilities
60
- result = 0.0
61
- background = first.background
62
- length.times do |column|
63
- ending_weight = first.background_sum ** (length - column - 1)
64
- already_enough_first = threshold_first - first.worst_suffix[column + 1]
65
- already_enough_second = threshold_second - second.worst_suffix[column + 1]
66
- least_sufficient_first = threshold_first - first.best_suffix[column + 1]
67
- least_sufficient_second = threshold_second - second.best_suffix[column + 1]
68
40
 
69
- new_scores = Hash.new{|h,k| h[k]=Hash.new{|h2,k2| h2[k2]=0} }
70
- scores.each do |score_first, second_scores|
71
- second_scores.each do |score_second, count|
72
- 4.times do |letter|
73
- new_score_first = score_first + first.matrix[column][letter]
74
- if new_score_first >= already_enough_first
75
- new_score_second = score_second + second.matrix[column][letter]
76
- if new_score_second >= already_enough_second
77
- result += count * background[letter] * ending_weight
78
- elsif new_score_second >= least_sufficient_second
79
- new_scores[new_score_first][new_score_second] += count * background[letter]
80
- end
81
- elsif new_score_first >= least_sufficient_first
82
- new_score_second = score_second + second.matrix[column][letter]
83
- if new_score_second >= least_sufficient_second
84
- new_scores[new_score_first][new_score_second] += count * background[letter]
85
- end
86
- end
87
- end
88
- end
89
- end
90
- raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_same_probabilities' if new_scores.inject(0){|sum,hsh|sum+hsh.size} > MaxHashSizeDouble
91
- scores = new_scores
92
- end
93
- [result, result]
94
- end
95
-
96
-
97
- def common_words_for_two_matrices(threshold_first, threshold_second)
98
- scores = { 0 => {0 => 1} } # scores_on_first_pwm, scores_on_second_pwm --> count_on_first_probabilities, count_on_second_probabilities
99
- result = 0
100
- length.times do |column|
101
- ending_weight = 4 ** (length - column - 1)
102
- already_enough_first = threshold_first - first.worst_suffix[column + 1]
103
- already_enough_second = threshold_second - second.worst_suffix[column + 1]
104
- least_sufficient_first = threshold_first - first.best_suffix[column + 1]
105
- least_sufficient_second = threshold_second - second.best_suffix[column + 1]
41
+ # wouldn't work without count_contribution_block
42
+ def recalc_score_hash(scores, first_column, second_column, least_sufficient_first, least_sufficient_second)
43
+ new_scores = Hash.new{|h,k| h[k] = Hash.new(0)}
44
+ scores.each do |score_first, second_scores|
45
+ second_scores.each do |score_second, count|
106
46
 
107
- new_scores = Hash.new{|h,k| h[k]=Hash.new{|h2,k2| h2[k2]=0} }
108
- scores.each do |score_first, second_scores|
109
- second_scores.each do |score_second, count|
110
- 4.times do |letter|
111
- new_score_first = score_first + first.matrix[column][letter]
112
- if new_score_first >= already_enough_first
113
- new_score_second = score_second + second.matrix[column][letter]
114
- if new_score_second >= already_enough_second
115
- result += count * ending_weight
116
- elsif new_score_second >= least_sufficient_second
117
- new_scores[new_score_first][new_score_second] += count
118
- end
119
- elsif new_score_first >= least_sufficient_first
120
- new_score_second = score_second + second.matrix[column][letter]
121
- if new_score_second >= least_sufficient_second
122
- new_scores[new_score_first][new_score_second] += count
123
- end
47
+ 4.times do |letter|
48
+ new_score_first = score_first + first_column[letter]
49
+ if new_score_first >= least_sufficient_first
50
+ new_score_second = score_second + second_column[letter]
51
+ if new_score_second >= least_sufficient_second
52
+ new_scores[new_score_first][new_score_second] += yield(count, letter)
124
53
  end
125
54
  end
126
55
  end
56
+
127
57
  end
128
-
129
- raise 'Hash overflow in Macroape::AlignedPairIntersection#common_words_for_two_matrices' if defined? MaxHashSizeDouble and new_scores.inject(0){|sum,hsh|sum+hsh.size} > MaxHashSizeDouble
130
- scores = new_scores
131
58
  end
132
- [result, result]
59
+ new_scores
133
60
  end
134
-
61
+
135
62
  end
136
63
  end
@@ -11,10 +11,10 @@ module Macroape
11
11
  @infos[pwm.name] = info
12
12
  end
13
13
  def ==(other)
14
- @rough_discretization == other.rough_discretization &&
15
- @precise_discretization == other.precise_discretization &&
16
- @background == other.background &&
17
- @pvalues == other.pvalues &&
14
+ @rough_discretization == other.rough_discretization &&
15
+ @precise_discretization == other.precise_discretization &&
16
+ @background == other.background &&
17
+ @pvalues == other.pvalues &&
18
18
  @pwms == other.pwms &&
19
19
  @infos == other.infos
20
20
  end
@@ -3,7 +3,7 @@ module Bioinform
3
3
  def threshold(pvalue)
4
4
  thresholds(pvalue){|_, thresh, _| return thresh }
5
5
  end
6
-
6
+
7
7
  def thresholds(*pvalues)
8
8
  thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
9
9
  threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
@@ -11,31 +11,30 @@ module Bioinform
11
11
  yield pvalue, threshold, real_pvalue
12
12
  end
13
13
  end
14
-
14
+
15
15
  def count_distribution_under_pvalue(max_pvalue)
16
- count_distribution={}
16
+ cnt_distribution = {}
17
17
  look_for_count = max_pvalue * vocabulary_volume
18
- until count_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
19
- count_distribution = count_distribution_after_threshold(threshold_gauss_estimation(max_pvalue))
18
+ until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
19
+ cnt_distribution = count_distribution_after_threshold(threshold_gauss_estimation(max_pvalue))
20
20
  max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
21
21
  end
22
-
23
- count_distribution
22
+
23
+ cnt_distribution
24
24
  end
25
-
26
-
25
+
26
+
27
27
  # ret-value: hash {pvalue => [thresholds, counts]}
28
28
  # thresholds = left_threshold .. right_threshold (left_threshold < right_threshold)
29
29
  # counts = left_count .. right_count (left_count > right_count)
30
30
  def thresholds_by_pvalues(*pvalues)
31
- count_distribution = count_distribution_under_pvalue(pvalues.max)
32
- sorted_scores = count_distribution.sort.reverse
31
+ sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
33
32
  scores = sorted_scores.map{|score,count| score}
34
33
  counts = sorted_scores.map{|score,count| count}
35
34
  partial_sums = counts.partial_sums
36
-
35
+
37
36
  results = {}
38
-
37
+
39
38
  pvalue_counts = pvalues.sort.collect_hash{|pvalue| [pvalue, pvalue * vocabulary_volume] }
40
39
  pvalue_counts.map do |pvalue,look_for_count|
41
40
  ind = partial_sums.index{|sum| sum >= look_for_count}
@@ -46,18 +45,19 @@ module Bioinform
46
45
 
47
46
  results
48
47
  end
49
-
48
+
50
49
  def count_distribution_after_threshold(threshold)
50
+ return @count_distribution.select{|score, count| score >= threshold} if @count_distribution
51
51
  scores = { 0 => 1 }
52
52
  length.times do |column|
53
- scores.replace recalc_score_hash(scores, @matrix[column], threshold - best_suffix[column + 1])
53
+ scores.replace recalc_score_hash(scores, @matrix[column], threshold - best_suffix(column + 1))
54
54
  raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold' if defined? MaxHashSizeSingle and scores.size > MaxHashSizeSingle
55
55
  end
56
56
  scores
57
57
  end
58
-
58
+
59
59
  def count_distribution
60
- count_distribution_after_threshold(worst_score)
60
+ @count_distribution ||= count_distribution_after_threshold(worst_score)
61
61
  end
62
62
 
63
63
  def recalc_score_hash(scores, column, least_sufficient)
@@ -72,6 +72,16 @@ module Bioinform
72
72
  end
73
73
  new_scores
74
74
  end
75
-
75
+
76
+ def counts_by_thresholds(*thresholds)
77
+ scores = count_distribution_after_threshold(thresholds.min)
78
+ thresholds.map{ |threshold|
79
+ scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
80
+ }
81
+ end
82
+
83
+ def pvalue_by_threshold(threshold)
84
+ counts_by_thresholds(threshold).first / vocabulary_volume
85
+ end
76
86
  end
77
87
  end