RubyGems - macroape - Versions diffs - 3.3.7 → 3.3.8 - Mend

macroape 3.3.7 → 3.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

data/README.md +2 -2
data/Rakefile.rb +6 -6
data/TODO.txt +23 -3
data/benchmark/similarity_benchmark.rb +18 -18
data/lib/macroape/aligned_pair_intersection.rb +4 -4
data/lib/macroape/cli/align_motifs.rb +34 -28
data/lib/macroape/cli/eval_alignment.rb +73 -47
data/lib/macroape/cli/eval_similarity.rb +65 -40
data/lib/macroape/cli/find_pvalue.rb +30 -34
data/lib/macroape/cli/find_threshold.rb +52 -41
data/lib/macroape/cli/preprocess_collection.rb +68 -58
data/lib/macroape/cli/scan_collection.rb +89 -73
data/lib/macroape/cli.rb +184 -1
data/lib/macroape/counting.rb +31 -5
data/lib/macroape/pwm_compare.rb +8 -2
data/lib/macroape/pwm_compare_aligned.rb +15 -10
data/lib/macroape/version.rb +2 -1
data/macroape.gemspec +2 -1
data/spec/count_distribution_spec.rb +11 -11
data/test/align_motifs_test.rb +16 -4
data/test/data/{AHR_si.pat → AHR_si.pwm} +0 -0
data/test/data/{KLF3_f1.pat → KLF3_f1.pwm} +0 -0
data/test/data/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
data/test/data/KLF4_f2_scan_results_all.txt +1 -2
data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -2
data/test/data/KLF4_f2_scan_results_precise_mode.txt +1 -2
data/test/data/KLF4_f2_scan_results_weak_threshold.txt +2 -0
data/test/data/{SP1_f1.pat → SP1_f1.pwm} +0 -0
data/test/data/{SP1_f1_revcomp.pat → SP1_f1_revcomp.pwm} +0 -0
data/test/data/collection_pcm_without_thresholds.yaml +186 -183
data/test/data/collection_without_thresholds.yaml +186 -183
data/test/data/{medium_motif.pat → medium_motif.pwm} +0 -0
data/test/data/{short_motif.pat → short_motif.pwm} +0 -0
data/test/data/test_collection/{GABPA_f1.pat → GABPA_f1.pwm} +0 -0
data/test/data/test_collection/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
data/test/data/test_collection/{SP1_f1.pat → SP1_f1.pwm} +0 -0
data/test/data/test_collection.yaml +179 -176
data/test/data/test_collection_weak.yaml +214 -0
data/test/eval_alignment_test.rb +97 -21
data/test/eval_similarity_test.rb +104 -26
data/test/find_pvalue_test.rb +22 -9
data/test/find_threshold_test.rb +76 -25
data/test/preprocess_collection_test.rb +16 -21
data/test/scan_collection_test.rb +26 -14
data/test/test_helper.rb +96 -12
metadata +44 -24

data/README.md CHANGED Viewed

@@ -23,8 +23,8 @@ Or install it yourself as:
   MacroAPE has 7 command-line tools:
 ### Tools for calculating thresholds and pvalues:
-  * find_threshold \<PWM file\> [-p \<pvalue\> (by default: 0.0005)]
-  * find_pvalue \<PWM file\> \<threshold\>
+  * find_threshold \<PWM file\> [\<pvalue(by default: 0.0005)\>...]
+  * find_pvalue \<PWM file\> \<threshold\>...
 ### Tools for evaluating Jaccard similarity measure in the best alignment and in certain alignment:
   * eval_similarity \<first PWM file\> \<second PWM file\>

data/Rakefile.rb CHANGED Viewed

@@ -8,7 +8,7 @@ namespace :spec do
     t.libs << "test"
     t.test_files = FileList['test/*_test.rb']
     t.verbose = true
-  end
+  end
   RSpec::Core::RakeTask.new
 end
@@ -19,23 +19,23 @@ namespace :benchmark do
   task :run do
     require 'open3'
     time = Time.now.strftime("%d-%m-%Y, %H:%M:%S sec")
-    File.open('benchmark/benchmark.log','a') do |f|
-      f.puts "=========================================================\n#{time}\n"
+    File.open('benchmark/benchmark.log','a') do |f|
+      f.puts "=========================================================\n#{time}\n"
       Dir.glob('benchmark/*_benchmark.rb') do |benchmark_filename|
         Open3.popen3("ruby -I ./benchmark #{benchmark_filename}") do |inp, out, err, wait_thr|
           benchmark_name = File.basename(benchmark_filename)
           out_str = out.read
           err_str = err.read
           benchmark_infos =  "-------------------\n#{benchmark_name}:\n#{out_str}\n"
           benchmark_infos_to_file = benchmark_infos
           puts benchmark_infos
           if err_str && !err_str.empty?
             STDERR.puts(err_str)
             benchmark_infos_to_file = benchmark_infos + "\n!!!\nError:\n#{err_str}\n"
           end
           # add info about git commit (if everything is commited, otherwise to commit one should use special option -c)
           f.puts benchmark_infos_to_file
         end

data/TODO.txt CHANGED Viewed

@@ -1,3 +1,23 @@
+ToDo:
+6)
+# TODO:  FIX: this test fails due to floating point precision error: estimated threshold is -19.0418 but '-19.0418'.to_f * 10000 = -190417.99999999997
+# A workaround exists: we can use fractions, i.e. ('-19.0418'.to_r * 10000).to_f = -190418.0, but it obscures the code and, if used carelessly, can cause a huge slowdown.
+# I think it should be used only at input, to work around the discretization issue
+#
+#  def test_process_large_pvalue_floating_point_error
+#    pvalue, threshold, real_pvalue = nil, nil, nil
+#    assert_nothing_raised {
+#      pvalue, threshold, real_pvalue = Helpers.find_threshold_output('KLF4_f2.pwm -p 0.8').strip.split("\t")
+#    }
+#    assert_equal '0.8', pvalue
+#    assert_equal Helpers.obtain_pvalue_by_threshold("KLF4_f2.pwm #{threshold}"), real_pvalue
+#  end
+7) thresholds and thresholds_weak should return a collection (Array or Hash) when no block is given
+  merge these two methods into one parametrized method
+8) (TODO: for theoretical consistency, at the cost of small inconsistencies with old calculations)
+  When we work with a strong threshold, we round the matrix up (in order to overestimate the threshold compared to the real one, thus obtaining an underestimated P-value) and take the upper bound of the fork of discrete thresholds.
+  When we estimate the lower bound of the threshold (weak threshold), we take the lower bound of the fork of discrete thresholds. But we should ALSO (not done yet) discretize the matrix by rounding down! This would allow us to give an exact answer to the question of which range the real threshold should lie in for a given P-value; for now we correctly estimate only the lower bound of the threshold (the upper bound of the P-value).
 Specs and tests:
   create spec on use of MaxHashSize, MaxHashSizeDouble
   create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
@@ -8,11 +28,11 @@ Ideas to increase perfomance:
   - (?) Rearrange rows by decreasing DIC in the aligned pair of matrices before counting
   - Create a Java extension for alignment_intersection methods in order to increase performance
   - Possibly algorithm shouldn't use hash but had two iterations: at first it determines possible hash scores for every length(if worst suffix is always zero, its flat space of scores at all pwm prefix lengths) of each pwm separately. And after that we can work with arrays which use such scores as indices via additional substructure
-Usability issues:
+Usability issues:
   make preprocess_collection be able to add information to existing collection of motifs. Make able to give collection a name from command line
 remove .stdin placeholder. Use tty? method instead
 use OptionParser or docopt
-make options more uniform so that some of them were reusable(and the question: can I apply two option parsers consequently?)
+make options more uniform so that some of them were reusable(and the question: can I apply two option parsers consequently?)z

data/benchmark/similarity_benchmark.rb CHANGED Viewed

@@ -3,15 +3,15 @@ require_relative 'benchmark_helper'
 class TaskToBenchmark
   def setup
     @matrix_first = "KLF4_f2.xml
-      0.30861857265872605 -2.254321000121579 0.13505703522674192 0.3285194224375633
-      -1.227018967707036 -4.814127713368663 1.3059890687390967 -4.908681463544344
-      -2.443469374521196 -4.648238485031404 1.3588686548279805 -4.441801801188402
-      -2.7177827948276123 -3.8073538975356565 1.356272809724262 -3.504104725510225
-      -0.5563232977367343 0.5340697765121405 -3.61417723090579 0.5270259776377405
-      -1.8687622060887386 -4.381483976582316 1.337932245336098 -3.815629658877517
-      -2.045671123823928 -2.384975142213679 0.7198551207724355 0.5449254135616948
-      -1.373157530374372 -3.0063112097748217 1.285188335493552 -2.5026044231773543
-      -2.1030513122772208 -1.8941348100402244 1.249265758393991 -1.4284210948906104
+      0.30861857265872605 -2.254321000121579 0.13505703522674192 0.3285194224375633
+      -1.227018967707036 -4.814127713368663 1.3059890687390967 -4.908681463544344
+      -2.443469374521196 -4.648238485031404 1.3588686548279805 -4.441801801188402
+      -2.7177827948276123 -3.8073538975356565 1.356272809724262 -3.504104725510225
+      -0.5563232977367343 0.5340697765121405 -3.61417723090579 0.5270259776377405
+      -1.8687622060887386 -4.381483976582316 1.337932245336098 -3.815629658877517
+      -2.045671123823928 -2.384975142213679 0.7198551207724355 0.5449254135616948
+      -1.373157530374372 -3.0063112097748217 1.285188335493552 -2.5026044231773543
+      -2.1030513122772208 -1.8941348100402244 1.249265758393991 -1.4284210948906104
       -1.3277128628152939 0.8982415633049462 -0.8080773665408135 -0.18161647647456935
       "
@@ -27,25 +27,25 @@ class TaskToBenchmark
     -0.4450938582835542  -2.2510053061629707  1.126543157436868  -1.7780413702431377
     -1.1896356092245055  -1.2251832285630033  1.163676006374752  -1.6080243648157357
     -0.5166047365590577  0.7641033353626651  -0.28626775700282125  -0.6825482097865606"
     @pvalue = 0.0005
-    @discretization = 10
+    @discretization = 1
     @first_background, @second_background = [1,1,1,1], [1,1,1,1]
-    @pwm_first = Bioinform::PWM.new(@matrix_first).background(@first_background).discrete(@discretization)
-    @pwm_second = Bioinform::PWM.new(@matrix_second).background(@second_background).discrete(@discretization)
+    @pwm_first = Bioinform::PWM.new(@matrix_first).set_parameters(background: @first_background).discrete(@discretization)
+    @pwm_second = Bioinform::PWM.new(@matrix_second).set_parameters(background: @second_background).discrete(@discretization)
     @cmp = Macroape::PWMCompare.new(@pwm_first, @pwm_second)
+    @first_threshold = @pwm_first.threshold(@pvalue)
+    @second_threshold = @pwm_second.threshold(@pvalue)
     self
   end
   def run
-    first_threshold = @pwm_first.threshold(@pvalue)
-    second_threshold = @pwm_second.threshold(@pvalue)
-    info = @cmp.jaccard(first_threshold, second_threshold)
+    info = @cmp.jaccard(@first_threshold, @second_threshold)
   end
 end
-benchmark_result = 10.times.collect do
+benchmark_result = 100.times.collect do
   task_to_benchmark = TaskToBenchmark.new.setup
   Benchmark.measure{  task_to_benchmark.run }
 end.inject(&:+)

data/lib/macroape/aligned_pair_intersection.rb CHANGED Viewed

@@ -18,8 +18,8 @@ module Macroape
         [result, result]
       end
     end
     # block has form: {|score,letter| contribution to count by `letter` with `score` }
     def get_counts(threshold_first, threshold_second, &count_contribution_block)
       # scores_on_first_pwm, scores_on_second_pwm --> count
@@ -34,7 +34,7 @@ module Macroape
           raise 'Hash overflow in Macroape::AlignedPairIntersection#counts_for_two_matrices_with_different_probabilities'
         end
       end
-      scores.inject(0.0){|sum,(score_first, hsh)| sum + hsh.inject(0.0){|sum,(score_second, count)| sum + count }}
+      scores.inject(0.0){|sum,(score_first, hsh)| sum + hsh.inject(0.0){|sum,(score_second, count)| sum + count }}
     end
     # wouldn't work without count_contribution_block
@@ -52,7 +52,7 @@ module Macroape
               end
             end
           end
         end
       end
       new_scores

data/lib/macroape/cli/align_motifs.rb CHANGED Viewed

@@ -1,49 +1,55 @@
+require 'docopt'
 require_relative '../../macroape'
 module Macroape
   module CLI
     module AlignMotifs
       def self.main(argv)
-        help_string = %q{
-        Usage:
-          ruby align_motifs pwm1_file pwm2_file pwm3_file
-          ruby align_motifs pcm1_file pcm2_file pcm3_file --pcm
-        Output:
-          pwm_1_file  shift_1  orientation_1
-          pwm_2_file  shift_2  orientation_2
-          pwm_3_file  shift_3  orientation_3
-        }
-        if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
-          STDERR.puts help_string
-          exit
-        end
+        doc = <<-DOCOPT.strip_doc
+          Align motifs tool.
+          It takes motifs and builds alignment of each motif to the first (leader) motif.
+          Output has format:
+            pwm_file_1  shift_1  orientation_1
+            pwm_file_2  shift_2  orientation_2
+            pwm_file_3  shift_3  orientation_3
+          Usage:
+            align_motifs [options] <pm-files>...
+          Options:
+            -h --help       Show this screen.
+            --pcm           Use PCMs instead of PWMs as input
+        DOCOPT
-        data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
-        leader = argv.shift
+        options = Docopt::docopt(doc, argv: argv)
+        data_model = options['--pcm'] ? Bioinform::PCM : Bioinform::PWM
+        motif_files = options['<pm-files>']
+        leader = motif_files.first
         background = [1,1,1,1]
-        discretization = 10
+        discretization = 1
         pvalue = 0.0005
         shifts = {leader => [0,:direct]}
-        pwm_first = data_model.new(File.read(leader)).to_pwm.set_parameters(background: background).discrete!(discretization)
-        argv.each do |motif_name|
-          pwm_second = data_model.new(File.read(motif_name)).to_pwm.set_parameters(background: background).discrete!(discretization)
-          cmp = Macroape::PWMCompare.new(pwm_first, pwm_second)
-          info = cmp.jaccard_by_pvalue(pvalue)
+        pwm_first = data_model.new(File.read(leader)).to_pwm
+        pwm_first.set_parameters(background: background).discrete!(discretization)
+        motif_files[1..-1].each do |motif_name|
+          pwm_second = data_model.new(File.read(motif_name)).to_pwm
+          pwm_second.set_parameters(background: background).discrete!(discretization)
+          info = Macroape::PWMCompare.new(pwm_first, pwm_second).jaccard_by_pvalue(pvalue)
           shifts[motif_name] = [info[:shift], info[:orientation]]
         end
         shifts.each do |motif_name, (shift,orientation)|
           puts "#{motif_name}\t#{shift}\t#{orientation}"
         end
-      rescue => err
-        STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
+      rescue Docopt::Exit => e
+        puts e.message
       end
     end
   end
 end

data/lib/macroape/cli/eval_alignment.rb CHANGED Viewed

@@ -3,48 +3,40 @@ require_relative '../../macroape'
 module Macroape
   module CLI
     module EvalAlignment
       def self.main(argv)
-        help_string = %q{
-        Command-line format:
-        ruby eval_alignment.rb <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
-        type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]
-             or in linux
-        cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_alignment.rb .stdin .stdin <shift> <orientation(direct/revcomp)> [options]
-        Options:
-          [-p <P-value>]
-          [-d <discretization level>]
-          [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
-        Output format:
-          <jaccard similarity coefficient>
-          <number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the given alignment>
-          <aligned 1st matrix>
-          <aligned 2nd matrix>
-          <shift> <orientation>
-        Examples:
-          ruby eval_alignment.rb motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
-             or on windows
-          type motifs/SP1.pat | ruby eval_alignment.rb motifs/KLF4.pat .stdin 0 revcomp -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
-             or in linux
-          cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_alignment.rb .stdin .stdin 3 direct -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
-        }
-        if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
-          STDERR.puts help_string
+        doc = <<-EOS.strip_doc
+          Command-line format:
+          #{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> <shift> <orientation(direct/revcomp)> [options]
+          Options:
+            [-p <P-value>]
+            [-d <discretization level>]
+            [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
+            [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
+            [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
+            [--first-threshold <threshold for the first matrix>]
+            [--second-threshold <threshold for the second matrix>]
+          Examples:
+            #{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -1 direct -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
+            #{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat 3 revcomp
+        EOS
+        if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
+          STDERR.puts doc
           exit
         end
         pvalue = 0.0005
-        discretization = 10
+        discretization = 10.0
         first_background = [1,1,1,1]
         second_background = [1,1,1,1]
-        max_hash_size = 1000000
-        max_pair_hash_size = 1000
+        max_hash_size = 10000000
+        max_pair_hash_size = 10000
+        pvalue_boundary = :upper
         data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
         first_file = argv.shift
@@ -53,9 +45,9 @@ module Macroape
         shift = argv.shift
         orientation = argv.shift
-        raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
-        raise 'You\'d specify shift' unless shift
-        raise 'You\'d specify orientation' unless orientation
+        raise 'You should specify two input sources (each is filename or .stdin)'  unless first_file and second_file
+        raise 'You should specify shift' unless shift
+        raise 'You should specify orientation' unless orientation
         shift = shift.to_i
         orientation = orientation.to_sym
@@ -76,16 +68,23 @@ module Macroape
               pvalue = argv.shift.to_f
             when '-d'
               discretization = argv.shift.to_f
-            when '-m'
+            when '--max-hash-size'
               max_hash_size = argv.shift.to_i
-            when '-md'
+            when '--max-2d-hash-size'
               max_pair_hash_size = argv.shift.to_i
             when '-b'
-              second_background = first_background = argv.shift(4).map(&:to_f)
+              second_background = first_background = argv.shift.split(',').map(&:to_f)
             when '-b1'
-              first_background = argv.shift(4).map(&:to_f)
+              first_background = argv.shift.split(',').map(&:to_f)
             when '-b2'
-              second_background = argv.shift(4).map(&:to_f)
+              second_background = argv.shift.split(',').map(&:to_f)
+            when '--boundary'
+              pvalue_boundary = argv.shift.to_sym
+              raise 'boundary should be either lower or upper'  unless  pvalue_boundary == :lower || pvalue_boundary == :upper
+            when '--first-threshold'
+              predefined_threshold_first = argv.shift.to_f
+            when '--second-threshold'
+              predefined_threshold_second = argv.shift.to_f
           end
         end
         raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
@@ -95,7 +94,7 @@ module Macroape
           input = $stdin.read
           parser = data_model.choose_parser(input).new(input)
         end
         if first_file == '.stdin'
           input_first = parser.parse
         else
@@ -111,18 +110,45 @@ module Macroape
           input_second = File.read(second_file)
         end
         pwm_second = data_model.new(input_second).to_pwm
         pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
         pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
         cmp = Macroape::PWMCompareAligned.new(pwm_first, pwm_second, shift, orientation).set_parameters(max_pair_hash_size: max_pair_hash_size)
-        info = cmp.alignment_infos.merge( cmp.jaccard_by_pvalue(pvalue) )
+        if predefined_threshold_first
+          threshold_first = predefined_threshold_first * discretization
+        else
+          if pvalue_boundary == :lower
+            threshold_first = pwm_first.threshold(pvalue)
+          else
+            threshold_first = pwm_first.weak_threshold(pvalue)
+          end
+        end
-        puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
+        if predefined_threshold_second
+          threshold_second = predefined_threshold_second * discretization
+        else
+          if pvalue_boundary == :lower
+            threshold_second = pwm_second.threshold(pvalue)
+          else
+            threshold_second = pwm_second.weak_threshold(pvalue)
+          end
+        end
+        info = cmp.alignment_infos.merge( cmp.jaccard(threshold_first, threshold_second) )
+        info.merge!(predefined_threshold_first: predefined_threshold_first,
+                    predefined_threshold_second: predefined_threshold_second,
+                    threshold_first: threshold_first / discretization,
+                    threshold_second: threshold_second / discretization,
+                    discretization: discretization,
+                    first_background: first_background,
+                    second_background: second_background,
+                    requested_pvalue: pvalue,
+                    pvalue_boundary: pvalue_boundary)
+        puts Helper.similarity_info_string(info)
       rescue => err
-        STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
+        STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
       end
     end

data/lib/macroape/cli/eval_similarity.rb CHANGED Viewed

@@ -3,54 +3,44 @@ require_relative '../../macroape'
 module Macroape
   module CLI
     module EvalSimilarity
       def self.main(argv)
-        help_string = %q{
+        doc = <<-EOS.strip_doc
         Command-line format:
-        ruby eval_similarity.rb <1st matrix pat-file> <2nd matrix pat-file> [options]
-             or on windows
-        type <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
-             or in linux
-        cat <1st matrix pat-file> <2nd matrix pat-file> | ruby eval_similarity.rb .stdin .stdin [options]
+        #{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> [options]
         Options:
           [-p <P-value>]
           [-d <discretization level>]
-          [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>]
-        Output has format:
-          <jaccard similarity coefficient>
-          <number of words recognized by both 1st and 2nd matrices | probability to draw a word recognized by both 1st and 2nd matrices> <length of the optimal alignment>
-          <optimal alignment, the 1st matrix>
-          <optimal alignment, the 2nd matrix>
-          <shift> <orientation>
+          [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
+          [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
+          [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
+          [--first-threshold <threshold for the first matrix>]
+          [--second-threshold <threshold for the second matrix>]
         Examples:
-          ruby eval_similarity.rb motifs/KLF4.pat motifs/SP1.pat -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
-             or on windows
-          type motifs/SP1.pat | ruby eval_similarity.rb motifs/KLF4.pat .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
-             or in linux
-          cat motifs/KLF4.pat motifs/SP1.pat | ruby eval_similarity.rb .stdin .stdin -p 0.0005 -d 100 -b 0.4 0.3 0.2 0.1
-        }
-        if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
-          STDERR.puts help_string
+          #{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
+        EOS
+        if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
+          STDERR.puts doc
           exit
         end
         pvalue = 0.0005
-        discretization = 10
+        discretization = 10.0
         first_background = [1,1,1,1]
         second_background = [1,1,1,1]
-        max_hash_size = 1000000
-        max_pair_hash_size = 1000
+        max_hash_size = 10000000
+        max_pair_hash_size = 10000
+        pvalue_boundary = :upper
-        data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
+        data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
         first_file = argv.shift
         second_file = argv.shift
-        raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
+        raise 'You should specify two input files' unless first_file and second_file
         until argv.empty?
           case argv.shift
@@ -58,16 +48,23 @@ module Macroape
               pvalue = argv.shift.to_f
             when '-d'
               discretization = argv.shift.to_f
-            when '-m'
+            when '--max-hash-size'
               max_hash_size = argv.shift.to_i
-            when '-md'
+            when '--max-2d-hash-size'
               max_pair_hash_size = argv.shift.to_i
             when '-b'
-              second_background = first_background = argv.shift(4).map(&:to_f)
+              second_background = first_background = argv.shift.split(',').map(&:to_f)
             when '-b1'
-              first_background = argv.shift(4).map(&:to_f)
+              first_background = argv.shift.split(',').map(&:to_f)
             when '-b2'
-              second_background = argv.shift(4).map(&:to_f)
+              second_background = argv.shift.split(',').map(&:to_f)
+            when '--boundary'
+              pvalue_boundary = argv.shift.to_sym
+              raise 'boundary should be either lower or upper'  unless  pvalue_boundary == :lower || pvalue_boundary == :upper
+            when '--first-threshold'
+              predefined_threshold_first = argv.shift.to_f
+            when '--second-threshold'
+              predefined_threshold_second = argv.shift.to_f
           end
         end
         raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background == first_background.reverse
@@ -77,7 +74,7 @@ module Macroape
           input = $stdin.read
           parser = data_model.choose_parser(input).new(input)
         end
         if first_file == '.stdin'
           input_first = parser.parse
         else
@@ -93,20 +90,48 @@ module Macroape
           input_second = File.read(second_file)
         end
         pwm_second = data_model.new(input_second).to_pwm
         pwm_first.set_parameters(background: first_background, max_hash_size: max_hash_size).discrete!(discretization)
         pwm_second.set_parameters(background: second_background, max_hash_size: max_hash_size).discrete!(discretization)
         cmp = Macroape::PWMCompare.new(pwm_first, pwm_second).set_parameters(max_pair_hash_size: max_pair_hash_size)
-        info = cmp.jaccard_by_pvalue(pvalue)
+        if predefined_threshold_first
+          threshold_first = predefined_threshold_first * discretization
+        else
+          if pvalue_boundary == :lower
+            threshold_first = pwm_first.threshold(pvalue)
+          else
+            threshold_first = pwm_first.weak_threshold(pvalue)
+          end
+        end
+        if predefined_threshold_second
+          threshold_second = predefined_threshold_second * discretization
+        else
+          if pvalue_boundary == :lower
+            threshold_second = pwm_second.threshold(pvalue)
+          else
+            threshold_second = pwm_second.weak_threshold(pvalue)
+          end
+        end
-        puts "#{info[:similarity]}\n#{info[:recognized_by_both]}\t#{info[:alignment_length]}\n#{info[:text]}\n#{info[:shift]}\t#{info[:orientation]}"
+        info = cmp.jaccard(threshold_first, threshold_second)
+        info.merge!(predefined_threshold_first: predefined_threshold_first,
+                    predefined_threshold_second: predefined_threshold_second,
+                    threshold_first: threshold_first.to_f / discretization,
+                    threshold_second: threshold_second.to_f / discretization,
+                    discretization: discretization,
+                    first_background: first_background,
+                    second_background: second_background,
+                    requested_pvalue: pvalue,
+                    pvalue_boundary: pvalue_boundary)
+        puts Helper.similarity_info_string(info)
       rescue => err
-        STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
+        STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
       end
     end
   end
 end