macroape 3.3.7 → 3.3.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/README.md +2 -2
  2. data/Rakefile.rb +6 -6
  3. data/TODO.txt +23 -3
  4. data/benchmark/similarity_benchmark.rb +18 -18
  5. data/lib/macroape/aligned_pair_intersection.rb +4 -4
  6. data/lib/macroape/cli/align_motifs.rb +34 -28
  7. data/lib/macroape/cli/eval_alignment.rb +73 -47
  8. data/lib/macroape/cli/eval_similarity.rb +65 -40
  9. data/lib/macroape/cli/find_pvalue.rb +30 -34
  10. data/lib/macroape/cli/find_threshold.rb +52 -41
  11. data/lib/macroape/cli/preprocess_collection.rb +68 -58
  12. data/lib/macroape/cli/scan_collection.rb +89 -73
  13. data/lib/macroape/cli.rb +184 -1
  14. data/lib/macroape/counting.rb +31 -5
  15. data/lib/macroape/pwm_compare.rb +8 -2
  16. data/lib/macroape/pwm_compare_aligned.rb +15 -10
  17. data/lib/macroape/version.rb +2 -1
  18. data/macroape.gemspec +2 -1
  19. data/spec/count_distribution_spec.rb +11 -11
  20. data/test/align_motifs_test.rb +16 -4
  21. data/test/data/{AHR_si.pat → AHR_si.pwm} +0 -0
  22. data/test/data/{KLF3_f1.pat → KLF3_f1.pwm} +0 -0
  23. data/test/data/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
  24. data/test/data/KLF4_f2_scan_results_all.txt +1 -2
  25. data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -2
  26. data/test/data/KLF4_f2_scan_results_precise_mode.txt +1 -2
  27. data/test/data/KLF4_f2_scan_results_weak_threshold.txt +2 -0
  28. data/test/data/{SP1_f1.pat → SP1_f1.pwm} +0 -0
  29. data/test/data/{SP1_f1_revcomp.pat → SP1_f1_revcomp.pwm} +0 -0
  30. data/test/data/collection_pcm_without_thresholds.yaml +186 -183
  31. data/test/data/collection_without_thresholds.yaml +186 -183
  32. data/test/data/{medium_motif.pat → medium_motif.pwm} +0 -0
  33. data/test/data/{short_motif.pat → short_motif.pwm} +0 -0
  34. data/test/data/test_collection/{GABPA_f1.pat → GABPA_f1.pwm} +0 -0
  35. data/test/data/test_collection/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
  36. data/test/data/test_collection/{SP1_f1.pat → SP1_f1.pwm} +0 -0
  37. data/test/data/test_collection.yaml +179 -176
  38. data/test/data/test_collection_weak.yaml +214 -0
  39. data/test/eval_alignment_test.rb +97 -21
  40. data/test/eval_similarity_test.rb +104 -26
  41. data/test/find_pvalue_test.rb +22 -9
  42. data/test/find_threshold_test.rb +76 -25
  43. data/test/preprocess_collection_test.rb +16 -21
  44. data/test/scan_collection_test.rb +26 -14
  45. data/test/test_helper.rb +96 -12
  46. metadata +44 -24
data/README.md CHANGED
@@ -23,8 +23,8 @@ Or install it yourself as:
23
23
  MacroAPE has 7 command-line tools:
24
24
 
25
25
  ### Tools for calculating thresholds and pvalues:
26
- * find_threshold \<PWM file\> [-p \<pvalue\> (by default: 0.0005)]
27
- * find_pvalue \<PWM file\> \<threshold\>
26
+ * find_threshold \<PWM file\> [\<pvalue(by default: 0.0005)\>...]
27
+ * find_pvalue \<PWM file\> \<threshold\>...
28
28
 
29
29
  ### Tools for evaluating Jaccard similarity measure in the best alignment and in certain alignment:
30
30
  * eval_similarity \<first PWM file\> \<second PWM file\>
data/Rakefile.rb CHANGED
@@ -8,7 +8,7 @@ namespace :spec do
8
8
  t.libs << "test"
9
9
  t.test_files = FileList['test/*_test.rb']
10
10
  t.verbose = true
11
- end
11
+ end
12
12
  RSpec::Core::RakeTask.new
13
13
  end
14
14
 
@@ -19,23 +19,23 @@ namespace :benchmark do
19
19
  task :run do
20
20
  require 'open3'
21
21
  time = Time.now.strftime("%d-%m-%Y, %H:%M:%S sec")
22
- File.open('benchmark/benchmark.log','a') do |f|
23
- f.puts "=========================================================\n#{time}\n"
22
+ File.open('benchmark/benchmark.log','a') do |f|
23
+ f.puts "=========================================================\n#{time}\n"
24
24
  Dir.glob('benchmark/*_benchmark.rb') do |benchmark_filename|
25
25
  Open3.popen3("ruby -I ./benchmark #{benchmark_filename}") do |inp, out, err, wait_thr|
26
26
  benchmark_name = File.basename(benchmark_filename)
27
27
  out_str = out.read
28
28
  err_str = err.read
29
-
29
+
30
30
  benchmark_infos = "-------------------\n#{benchmark_name}:\n#{out_str}\n"
31
31
  benchmark_infos_to_file = benchmark_infos
32
32
  puts benchmark_infos
33
-
33
+
34
34
  if err_str && !err_str.empty?
35
35
  STDERR.puts(err_str)
36
36
  benchmark_infos_to_file = benchmark_infos + "\n!!!\nError:\n#{err_str}\n"
37
37
  end
38
-
38
+
39
39
  # add info about the git commit (if everything is committed; otherwise, to commit, one should use the special option -c)
40
40
  f.puts benchmark_infos_to_file
41
41
  end
data/TODO.txt CHANGED
@@ -1,3 +1,23 @@
1
+ ToDo:
2
+ 6)
3
+ # TODO: FIX: this test fails due to floating point precision error: estimated threshold is -19.0418 but '-19.0418'.to_f * 10000 = -190417.99999999997
4
+ # A workaround exists: we can use fractions, i.e. ('-19.0418'.to_r * 10000).to_f = -190418.0, but it obscures the code and, if used carelessly, can cause a huge slowdown.
5
+ # I think it should be used only at input, to work around the discretization issue
6
+ #
7
+ # def test_process_large_pvalue_floating_point_error
8
+ # pvalue, threshold, real_pvalue = nil, nil, nil
9
+ # assert_nothing_raised {
10
+ # pvalue, threshold, real_pvalue = Helpers.find_threshold_output('KLF4_f2.pwm -p 0.8').strip.split("\t")
11
+ # }
12
+ # assert_equal '0.8', pvalue
13
+ # assert_equal Helpers.obtain_pvalue_by_threshold("KLF4_f2.pwm #{threshold}"), real_pvalue
14
+ # end
15
+ 7)thresholds and thresholds_weak should return a collection (Array or Hash) when block not given
16
+ merge these two methods into one parametrized method
17
+ 8)(TODO: for theoretical consistency, at the cost of small inconsistencies with old calculations)
18
+ When we work with a strong threshold, we round the matrix up (in order to overestimate the threshold compared to the real one, thus obtaining an underestimated P-value) and take the upper bound of the fork (interval) of discrete thresholds.
19
+ When we are estimating the lower bound of the threshold (weak threshold), we take the lower bound of the fork (interval) of discrete thresholds. But we should ALSO (not done yet) round the matrix down when discretizing! This would allow us to give an exact answer to the question of which range the real threshold should lie in for a given P-value; currently we correctly estimate only the lower bound of the threshold (the upper bound of the P-value)
20
+
1
21
  Specs and tests:
2
22
  create spec on use of MaxHashSize, MaxHashSizeDouble
3
23
  create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
@@ -8,11 +28,11 @@ Ideas to increase perfomance:
8
28
  - (?) Rearrange rows in order of decreasing DIC in the aligned pair of matrices before counting
9
29
  - Create a Java extension for the alignment_intersection methods in order to increase performance
10
30
  - Possibly algorithm shouldn't use hash but had two iterations: at first it determines possible hash scores for every length(if worst suffix is always zero, its flat space of scores at all pwm prefix lengths) of each pwm separately. And after that we can work with arrays which use such scores as indices via additional substructure
11
-
12
- Usability issues:
31
+
32
+ Usability issues:
13
33
  make preprocess_collection able to add information to an existing collection of motifs. Make it possible to give the collection a name from the command line
14
34
 
15
35
  remove .stdin placeholder. Use tty? method instead
16
36
 
17
37
  use OptionParser or docopt
18
- make options more uniform so that some of them were reusable(and the question: can I apply two option parsers consequently?)
38
+ make options more uniform so that some of them are reusable (and the question: can I apply two option parsers consecutively?)
@@ -3,15 +3,15 @@ require_relative 'benchmark_helper'
3
3
  class TaskToBenchmark
4
4
  def setup
5
5
  @matrix_first = "KLF4_f2.xml
6
- 0.30861857265872605 -2.254321000121579 0.13505703522674192 0.3285194224375633
7
- -1.227018967707036 -4.814127713368663 1.3059890687390967 -4.908681463544344
8
- -2.443469374521196 -4.648238485031404 1.3588686548279805 -4.441801801188402
9
- -2.7177827948276123 -3.8073538975356565 1.356272809724262 -3.504104725510225
10
- -0.5563232977367343 0.5340697765121405 -3.61417723090579 0.5270259776377405
11
- -1.8687622060887386 -4.381483976582316 1.337932245336098 -3.815629658877517
12
- -2.045671123823928 -2.384975142213679 0.7198551207724355 0.5449254135616948
13
- -1.373157530374372 -3.0063112097748217 1.285188335493552 -2.5026044231773543
14
- -2.1030513122772208 -1.8941348100402244 1.249265758393991 -1.4284210948906104
6
+ 0.30861857265872605 -2.254321000121579 0.13505703522674192 0.3285194224375633
7
+ -1.227018967707036 -4.814127713368663 1.3059890687390967 -4.908681463544344
8
+ -2.443469374521196 -4.648238485031404 1.3588686548279805 -4.441801801188402
9
+ -2.7177827948276123 -3.8073538975356565 1.356272809724262 -3.504104725510225
10
+ -0.5563232977367343 0.5340697765121405 -3.61417723090579 0.5270259776377405
11
+ -1.8687622060887386 -4.381483976582316 1.337932245336098 -3.815629658877517
12
+ -2.045671123823928 -2.384975142213679 0.7198551207724355 0.5449254135616948
13
+ -1.373157530374372 -3.0063112097748217 1.285188335493552 -2.5026044231773543
14
+ -2.1030513122772208 -1.8941348100402244 1.249265758393991 -1.4284210948906104
15
15
  -1.3277128628152939 0.8982415633049462 -0.8080773665408135 -0.18161647647456935
16
16
  "
17
17
 
@@ -27,25 +27,25 @@ class TaskToBenchmark
27
27
  -0.4450938582835542 -2.2510053061629707 1.126543157436868 -1.7780413702431377
28
28
  -1.1896356092245055 -1.2251832285630033 1.163676006374752 -1.6080243648157357
29
29
  -0.5166047365590577 0.7641033353626651 -0.28626775700282125 -0.6825482097865606"
30
-
30
+
31
31
  @pvalue = 0.0005
32
- @discretization = 10
32
+ @discretization = 1
33
33
  @first_background, @second_background = [1,1,1,1], [1,1,1,1]
34
-
35
- @pwm_first = Bioinform::PWM.new(@matrix_first).background(@first_background).discrete(@discretization)
36
- @pwm_second = Bioinform::PWM.new(@matrix_second).background(@second_background).discrete(@discretization)
34
+
35
+ @pwm_first = Bioinform::PWM.new(@matrix_first).set_parameters(background: @first_background).discrete(@discretization)
36
+ @pwm_second = Bioinform::PWM.new(@matrix_second).set_parameters(background: @second_background).discrete(@discretization)
37
37
  @cmp = Macroape::PWMCompare.new(@pwm_first, @pwm_second)
38
+ @first_threshold = @pwm_first.threshold(@pvalue)
39
+ @second_threshold = @pwm_second.threshold(@pvalue)
38
40
  self
39
41
  end
40
42
 
41
43
  def run
42
- first_threshold = @pwm_first.threshold(@pvalue)
43
- second_threshold = @pwm_second.threshold(@pvalue)
44
- info = @cmp.jaccard(first_threshold, second_threshold)
44
+ info = @cmp.jaccard(@first_threshold, @second_threshold)
45
45
  end
46
46
  end
47
47
 
48
- benchmark_result = 10.times.collect do
48
+ benchmark_result = 100.times.collect do
49
49
  task_to_benchmark = TaskToBenchmark.new.setup
50
50
  Benchmark.measure{ task_to_benchmark.run }
51
51
  end.inject(&:+)
@@ -18,8 +18,8 @@ module Macroape
18
18
  [result, result]
19
19
  end
20
20
  end
21
-
22
-
21
+
22
+
23
23
  # block has form: {|score,letter| contribution to count by `letter` with `score` }
24
24
  def get_counts(threshold_first, threshold_second, &count_contribution_block)
25
25
  # scores_on_first_pwm, scores_on_second_pwm --> count
@@ -34,7 +34,7 @@ module Macroape
34
34
  raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
35
35
  end
36
36
  end
37
- scores.inject(0.0){|sum,(score_first, hsh)| sum + hsh.inject(0.0){|sum,(score_second, count)| sum + count }}
37
+ scores.inject(0.0){|sum,(score_first, hsh)| sum + hsh.inject(0.0){|sum,(score_second, count)| sum + count }}
38
38
  end
39
39
 
40
40
  # wouldn't work without count_contribution_block
@@ -52,7 +52,7 @@ module Macroape
52
52
  end
53
53
  end
54
54
  end
55
-
55
+
56
56
  end
57
57
  end
58
58
  new_scores
@@ -1,49 +1,55 @@
1
+ require 'docopt'
1
2
  require_relative '../../macroape'
2
3
 
3
4
  module Macroape
4
5
  module CLI
5
6
  module AlignMotifs
6
-
7
+
7
8
  def self.main(argv)
8
- help_string = %q{
9
- Usage:
10
- ruby align_motifs pwm1_file pwm2_file pwm3_file
11
- ruby align_motifs pcm1_file pcm2_file pcm3_file --pcm
12
- Output:
13
- pwm_1_file shift_1 orientation_1
14
- pwm_2_file shift_2 orientation_2
15
- pwm_3_file shift_3 orientation_3
16
- }
17
-
18
- if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
19
- STDERR.puts help_string
20
- exit
21
- end
9
+ doc = <<-DOCOPT.strip_doc
10
+ Align motifs tool.
11
+ It takes motifs and builds alignment of each motif to the first (leader) motif.
12
+
13
+ Output has format:
14
+ pwm_file_1 shift_1 orientation_1
15
+ pwm_file_2 shift_2 orientation_2
16
+ pwm_file_3 shift_3 orientation_3
17
+
18
+ Usage:
19
+ align_motifs [options] <pm-files>...
22
20
 
21
+ Options:
22
+ -h --help Show this screen.
23
+ --pcm Use PCMs instead of PWMs as input
24
+ DOCOPT
23
25
 
24
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
25
- leader = argv.shift
26
+ options = Docopt::docopt(doc, argv: argv)
27
+
28
+ data_model = options['--pcm'] ? Bioinform::PCM : Bioinform::PWM
29
+ motif_files = options['<pm-files>']
30
+ leader = motif_files.first
26
31
  background = [1,1,1,1]
27
- discretization = 10
32
+ discretization = 1
28
33
  pvalue = 0.0005
29
-
34
+
30
35
  shifts = {leader => [0,:direct]}
31
- pwm_first = data_model.new(File.read(leader)).to_pwm.set_parameters(background: background).discrete!(discretization)
32
- argv.each do |motif_name|
33
- pwm_second = data_model.new(File.read(motif_name)).to_pwm.set_parameters(background: background).discrete!(discretization)
34
- cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
35
- info = cmp.jaccard_by_pvalue(pvalue)
36
+ pwm_first = data_model.new(File.read(leader)).to_pwm
37
+ pwm_first.set_parameters(background: background).discrete!(discretization)
38
+ motif_files[1..-1].each do |motif_name|
39
+ pwm_second = data_model.new(File.read(motif_name)).to_pwm
40
+ pwm_second.set_parameters(background: background).discrete!(discretization)
41
+ info = Macroape::PWMCompare.new(pwm_first, pwm_second).jaccard_by_pvalue(pvalue)
36
42
  shifts[motif_name] = [info[:shift], info[:orientation]]
37
43
  end
38
-
44
+
39
45
  shifts.each do |motif_name, (shift,orientation)|
40
46
  puts "#{motif_name}\t#{shift}\t#{orientation}"
41
47
  end
42
48
 
43
- rescue => err
44
- STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
49
+ rescue Docopt::Exit => e
50
+ puts e.message
45
51
  end
46
-
52
+
47
53
  end
48
54
  end
49
55
  end
@@ -3,48 +3,40 @@ require_relative '../../macroape'
3
3
  module Macroape
4
4
  module CLI
5
5
  module EvalAlignment
6
-
6
+
7
7
  def self.main(argv)
8
- help_string = %q{
9
- Command-line format:
10
- ruby eval_alignment.rb <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
11
- type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]
12
- or in linux
13
- cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]
14
-
15
- Options:
16
- [-p <P-value>]
17
- [-d <discretization level>]
18
- [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
19
-
20
- Output format:
21
- <jaccard similarity coefficient>
22
- <number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the given alignment>
23
- <aligned 1st matrix>
24
- <aligned 2nd matrix>
25
- <shift> <orientation>
26
-
27
- Examples:
28
- ruby eval_alignment.rb motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
29
- or on windows
30
- type motifs/SP1.pat | ruby eval_alignment.rb motifs/KLF4.pat .stdin 0 revcomp -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
31
- or in linux
32
- cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
33
- }
34
-
35
- if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
36
- STDERR.puts help_string
8
+ doc = <<-EOS.strip_doc
9
+ Command-line format:
10
+ #{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
11
+
12
+ Options:
13
+ [-p <P-value>]
14
+ [-d <discretization level>]
15
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
16
+ [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
17
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
18
+ [--first-threshold <threshold for the first matrix>]
19
+ [--second-threshold <threshold for the second matrix>]
20
+
21
+ Examples:
22
+ #{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
23
+ #{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat 3 revcomp
24
+ EOS
25
+
26
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
27
+ STDERR.puts doc
37
28
  exit
38
29
  end
39
30
 
40
31
  pvalue = 0.0005
41
- discretization = 10
32
+ discretization = 10.0
42
33
 
43
34
  first_background = [1,1,1,1]
44
35
  second_background = [1,1,1,1]
45
- max_hash_size = 1000000
46
- max_pair_hash_size = 1000
47
-
36
+ max_hash_size = 10000000
37
+ max_pair_hash_size = 10000
38
+ pvalue_boundary = :upper
39
+
48
40
  data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
49
41
 
50
42
  first_file = argv.shift
@@ -53,9 +45,9 @@ module Macroape
53
45
  shift = argv.shift
54
46
  orientation = argv.shift
55
47
 
56
- raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
57
- raise 'You\'d specify shift' unless shift
58
- raise 'You\'d specify orientation' unless orientation
48
+ raise 'You should specify two input sources (each is filename or .stdin)' unless first_file and second_file
49
+ raise 'You should specify shift' unless shift
50
+ raise 'You should specify orientation' unless orientation
59
51
 
60
52
  shift = shift.to_i
61
53
  orientation = orientation.to_sym
@@ -76,16 +68,23 @@ module Macroape
76
68
  pvalue = argv.shift.to_f
77
69
  when '-d'
78
70
  discretization = argv.shift.to_f
79
- when '-m'
71
+ when '--max-hash-size'
80
72
  max_hash_size = argv.shift.to_i
81
- when '-md'
73
+ when '--max-2d-hash-size'
82
74
  max_pair_hash_size = argv.shift.to_i
83
75
  when '-b'
84
- second_background = first_background = argv.shift(4).map(&:to_f)
76
+ second_background = first_background = argv.shift.split(',').map(&:to_f)
85
77
  when '-b1'
86
- first_background = argv.shift(4).map(&:to_f)
78
+ first_background = argv.shift.split(',').map(&:to_f)
87
79
  when '-b2'
88
- second_background = argv.shift(4).map(&:to_f)
80
+ second_background = argv.shift.split(',').map(&:to_f)
81
+ when '--boundary'
82
+ pvalue_boundary = argv.shift.to_sym
83
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
84
+ when '--first-threshold'
85
+ predefined_threshold_first = argv.shift.to_f
86
+ when '--second-threshold'
87
+ predefined_threshold_second = argv.shift.to_f
89
88
  end
90
89
  end
91
90
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
@@ -95,7 +94,7 @@ module Macroape
95
94
  input = $stdin.read
96
95
  parser = data_model.choose_parser(input).new(input)
97
96
  end
98
-
97
+
99
98
  if first_file == '.stdin'
100
99
  input_first = parser.parse
101
100
  else
@@ -111,18 +110,45 @@ module Macroape
111
110
  input_second = File.read(second_file)
112
111
  end
113
112
  pwm_second = data_model.new(input_second).to_pwm
114
-
113
+
115
114
  pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
116
115
  pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
117
116
 
118
117
  cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation).set_parameters(max_pair_hash_size: max_pair_hash_size)
119
118
 
120
- info = cmp.alignment_infos.merge( cmp.jaccard_by_pvalue(pvalue) )
119
+ if predefined_threshold_first
120
+ threshold_first = predefined_threshold_first * discretization
121
+ else
122
+ if pvalue_boundary == :lower
123
+ threshold_first = pwm_first.threshold(pvalue)
124
+ else
125
+ threshold_first = pwm_first.weak_threshold(pvalue)
126
+ end
127
+ end
121
128
 
122
- puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
129
+ if predefined_threshold_second
130
+ threshold_second = predefined_threshold_second * discretization
131
+ else
132
+ if pvalue_boundary == :lower
133
+ threshold_second = pwm_second.threshold(pvalue)
134
+ else
135
+ threshold_second = pwm_second.weak_threshold(pvalue)
136
+ end
137
+ end
138
+ info = cmp.alignment_infos.merge( cmp.jaccard(threshold_first, threshold_second) )
139
+ info.merge!(predefined_threshold_first: predefined_threshold_first,
140
+ predefined_threshold_second: predefined_threshold_second,
141
+ threshold_first: threshold_first / discretization,
142
+ threshold_second: threshold_second / discretization,
143
+ discretization: discretization,
144
+ first_background: first_background,
145
+ second_background: second_background,
146
+ requested_pvalue: pvalue,
147
+ pvalue_boundary: pvalue_boundary)
148
+ puts Helper.similarity_info_string(info)
123
149
 
124
150
  rescue => err
125
- STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
151
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
126
152
  end
127
153
 
128
154
  end
@@ -3,54 +3,44 @@ require_relative '../../macroape'
3
3
  module Macroape
4
4
  module CLI
5
5
  module EvalSimilarity
6
-
6
+
7
7
  def self.main(argv)
8
- help_string = %q{
8
+ doc = <<-EOS.strip_doc
9
9
  Command-line format:
10
- ruby eval_similarity.rb <1st matrix pat-file> <2nd matrix pat-file> [options]
11
- or on windows
12
- type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
13
- or in linux
14
- cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
10
+ #{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> [options]
15
11
 
16
12
  Options:
17
13
  [-p <P-value>]
18
14
  [-d <discretization level>]
19
- [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
20
-
21
- Output has format:
22
- <jaccard similarity coefficient>
23
- <number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the optimal alignment>
24
- <optimal alignment, the 1st matrix>
25
- <optimal alignment, the 2nd matrix>
26
- <shift> <orientation>
15
+ [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
16
+ [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
17
+ [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
18
+ [--first-threshold <threshold for the first matrix>]
19
+ [--second-threshold <threshold for the second matrix>]
27
20
 
28
21
  Examples:
29
- ruby eval_similarity.rb motifs/KLF4.pat motifs/SP1.pat -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
30
- or on windows
31
- type motifs/SP1.pat | ruby eval_similarity.rb motifs/KLF4.pat .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
32
- or in linux
33
- cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_similarity.rb .stdin .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
34
- }
35
-
36
- if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
37
- STDERR.puts help_string
22
+ #{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
23
+ EOS
24
+
25
+ if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
26
+ STDERR.puts doc
38
27
  exit
39
28
  end
40
29
 
41
30
  pvalue = 0.0005
42
- discretization = 10
31
+ discretization = 10.0
43
32
 
44
33
  first_background = [1,1,1,1]
45
34
  second_background = [1,1,1,1]
46
35
 
47
- max_hash_size = 1000000
48
- max_pair_hash_size = 1000
36
+ max_hash_size = 10000000
37
+ max_pair_hash_size = 10000
38
+ pvalue_boundary = :upper
49
39
 
50
- data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
40
+ data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
51
41
  first_file = argv.shift
52
42
  second_file = argv.shift
53
- raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
43
+ raise 'You should specify two input files' unless first_file and second_file
54
44
 
55
45
  until argv.empty?
56
46
  case argv.shift
@@ -58,16 +48,23 @@ module Macroape
58
48
  pvalue = argv.shift.to_f
59
49
  when '-d'
60
50
  discretization = argv.shift.to_f
61
- when '-m'
51
+ when '--max-hash-size'
62
52
  max_hash_size = argv.shift.to_i
63
- when '-md'
53
+ when '--max-2d-hash-size'
64
54
  max_pair_hash_size = argv.shift.to_i
65
55
  when '-b'
66
- second_background = first_background = argv.shift(4).map(&:to_f)
56
+ second_background = first_background = argv.shift.split(',').map(&:to_f)
67
57
  when '-b1'
68
- first_background = argv.shift(4).map(&:to_f)
58
+ first_background = argv.shift.split(',').map(&:to_f)
69
59
  when '-b2'
70
- second_background = argv.shift(4).map(&:to_f)
60
+ second_background = argv.shift.split(',').map(&:to_f)
61
+ when '--boundary'
62
+ pvalue_boundary = argv.shift.to_sym
63
+ raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
64
+ when '--first-threshold'
65
+ predefined_threshold_first = argv.shift.to_f
66
+ when '--second-threshold'
67
+ predefined_threshold_second = argv.shift.to_f
71
68
  end
72
69
  end
73
70
  raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
@@ -77,7 +74,7 @@ module Macroape
77
74
  input = $stdin.read
78
75
  parser = data_model.choose_parser(input).new(input)
79
76
  end
80
-
77
+
81
78
  if first_file == '.stdin'
82
79
  input_first = parser.parse
83
80
  else
@@ -93,20 +90,48 @@ module Macroape
93
90
  input_second = File.read(second_file)
94
91
  end
95
92
  pwm_second = data_model.new(input_second).to_pwm
96
-
93
+
97
94
  pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
98
95
  pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
99
96
 
100
97
  cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).set_parameters(max_pair_hash_size: max_pair_hash_size)
101
98
 
102
- info = cmp.jaccard_by_pvalue(pvalue)
99
+ if predefined_threshold_first
100
+ threshold_first = predefined_threshold_first * discretization
101
+ else
102
+ if pvalue_boundary == :lower
103
+ threshold_first = pwm_first.threshold(pvalue)
104
+ else
105
+ threshold_first = pwm_first.weak_threshold(pvalue)
106
+ end
107
+ end
108
+
109
+ if predefined_threshold_second
110
+ threshold_second = predefined_threshold_second * discretization
111
+ else
112
+ if pvalue_boundary == :lower
113
+ threshold_second = pwm_second.threshold(pvalue)
114
+ else
115
+ threshold_second = pwm_second.weak_threshold(pvalue)
116
+ end
117
+ end
103
118
 
104
- puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
119
+ info = cmp.jaccard(threshold_first, threshold_second)
120
+ info.merge!(predefined_threshold_first: predefined_threshold_first,
121
+ predefined_threshold_second: predefined_threshold_second,
122
+ threshold_first: threshold_first.to_f / discretization,
123
+ threshold_second: threshold_second.to_f / discretization,
124
+ discretization: discretization,
125
+ first_background: first_background,
126
+ second_background: second_background,
127
+ requested_pvalue: pvalue,
128
+ pvalue_boundary: pvalue_boundary)
129
+ puts Helper.similarity_info_string(info)
105
130
 
106
131
  rescue => err
107
- STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
132
+ STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
108
133
  end
109
-
134
+
110
135
  end
111
136
  end
112
137
  end