RubyGems - macroape - Versions diffs - 3.3.7 → 3.3.8 - Mend

macroape 3.3.7 → 3.3.8

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (46)

data/README.md +2 -2
data/Rakefile.rb +6 -6
data/TODO.txt +23 -3
data/benchmark/similarity_benchmark.rb +18 -18
data/lib/macroape/aligned_pair_intersection.rb +4 -4
data/lib/macroape/cli/align_motifs.rb +34 -28
data/lib/macroape/cli/eval_alignment.rb +73 -47
data/lib/macroape/cli/eval_similarity.rb +65 -40
data/lib/macroape/cli/find_pvalue.rb +30 -34
data/lib/macroape/cli/find_threshold.rb +52 -41
data/lib/macroape/cli/preprocess_collection.rb +68 -58
data/lib/macroape/cli/scan_collection.rb +89 -73
data/lib/macroape/cli.rb +184 -1
data/lib/macroape/counting.rb +31 -5
data/lib/macroape/pwm_compare.rb +8 -2
data/lib/macroape/pwm_compare_aligned.rb +15 -10
data/lib/macroape/version.rb +2 -1
data/macroape.gemspec +2 -1
data/spec/count_distribution_spec.rb +11 -11
data/test/align_motifs_test.rb +16 -4
data/test/data/{AHR_si.pat → AHR_si.pwm} +0 -0
data/test/data/{KLF3_f1.pat → KLF3_f1.pwm} +0 -0
data/test/data/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
data/test/data/KLF4_f2_scan_results_all.txt +1 -2
data/test/data/KLF4_f2_scan_results_default_cutoff.txt +1 -2
data/test/data/KLF4_f2_scan_results_precise_mode.txt +1 -2
data/test/data/KLF4_f2_scan_results_weak_threshold.txt +2 -0
data/test/data/{SP1_f1.pat → SP1_f1.pwm} +0 -0
data/test/data/{SP1_f1_revcomp.pat → SP1_f1_revcomp.pwm} +0 -0
data/test/data/collection_pcm_without_thresholds.yaml +186 -183
data/test/data/collection_without_thresholds.yaml +186 -183
data/test/data/{medium_motif.pat → medium_motif.pwm} +0 -0
data/test/data/{short_motif.pat → short_motif.pwm} +0 -0
data/test/data/test_collection/{GABPA_f1.pat → GABPA_f1.pwm} +0 -0
data/test/data/test_collection/{KLF4_f2.pat → KLF4_f2.pwm} +0 -0
data/test/data/test_collection/{SP1_f1.pat → SP1_f1.pwm} +0 -0
data/test/data/test_collection.yaml +179 -176
data/test/data/test_collection_weak.yaml +214 -0
data/test/eval_alignment_test.rb +97 -21
data/test/eval_similarity_test.rb +104 -26
data/test/find_pvalue_test.rb +22 -9
data/test/find_threshold_test.rb +76 -25
data/test/preprocess_collection_test.rb +16 -21
data/test/scan_collection_test.rb +26 -14
data/test/test_helper.rb +96 -12
metadata +44 -24

data/lib/macroape/cli/scan_collection.rb CHANGED Viewed

@@ -4,64 +4,66 @@ require 'yaml'
 module Macroape
   module CLI
     module ScanCollection
       def self.main(argv)
-        help_string = %q{
-        Command-line format:
-        ruby scan_collection.rb <pat-file> <collection> [options]
-                or in linux
-        cat <pat-file> | ruby scan_collection.rb .stdin <collection> [options]
-                or on windows
-        type <pat-file> | ruby scan_collection.rb .stdin <collection> [options]
-        Options:
-          [-p <P-value>]
-          [-c <similarity cutoff (minimal similarity to be included in output)> ] or [--all], '-c 0.05' by default
-          [--precise [<level, minimal similarity to check on a more precise discretization level on the second pass>]], off by default, '--precise 0.01' if level is not set
-          [--silent] - don't show current progress information during scan (by default this information's written into stderr)
-        Output format:
-         <name> <similarity jaccard index> <shift> <overlap> <orientation> * [in case that result calculated on the second pass(in precise mode)]
-            Attention! Name can contain whitespace characters.
-            Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
-        Example:
-          ruby scan_collection.rb motifs/KLF4.pat collection.yaml -p 0.005
-                    or in linux
-          cat motifs/KLF4.pat | ruby scan_collection.rb .stdin collection.yaml -p 0.005 --precise 0.03
-        }
-        if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
-          STDERR.puts help_string
+        doc = <<-EOS.strip_doc
+          Command-line format:
+          #{run_tool_cmd} <pat-file> <collection> [options]
+          Options:
+            [-p <P-value>]
+            [-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results
+            [--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.01' if level is not set
+            [--silent] - hide current progress information during scan (printed to stderr by default)
+            [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
+            [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
+            [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
+          Output format:
+           <name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]
+              Attention! Name can contain whitespace characters.
+              Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
+          Example:
+            #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml
+            #{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03
+        EOS
+        if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
+          STDERR.puts doc
           exit
         end
         data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
         filename = argv.shift
         collection_file = argv.shift
-        raise "No input. You'd specify input source for pat: filename or .stdin" unless filename
-        raise "No input. You'd specify input file with collection" unless collection_file
+        raise 'No input. You should specify input file with matrix' unless filename
+        raise 'No input. You should specify input file with collection' unless collection_file
         raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)
         pvalue = 0.0005
         cutoff = 0.05 # minimal similarity to output
         collection = YAML.load_file(collection_file)
-        background_query = collection.parameters.background
-        max_hash_size = 1000000
-        max_pair_hash_size = 1000
+        collection_background = collection.parameters.background
+        query_background = collection_background
+        rough_discretization = collection.parameters.rough_discretization
+        precise_discretization = collection.parameters.precise_discretization
+        max_hash_size = 10000000
+        max_pair_hash_size = 10000
+        pvalue_boundary = :upper
         silent = false
         precision_mode = :rough
         until argv.empty?
           case argv.shift
-            when '-bq'
-              background_query = argv.shift(4).map(&:to_f)
-              raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background_query == background_query.reverse
+            when '-b'
+              query_background = argv.shift.split(',').map(&:to_f)
+              raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background == query_background.reverse
             when '-p'
               pvalue = argv.shift.to_f
-            when '-m'
-              max_hash_size = argv.shift.to_i
-            when '-md'
+            when '--max-hash-size'
+              max_hash_size = argv.shift.to_i
+            when '--max-2d-hash-size'
               max_pair_hash_size = argv.shift.to_i
             when '-c'
               cutoff = argv.shift.to_f
@@ -69,6 +71,9 @@ module Macroape
               cutoff = 0.0
             when '--silent'
               silent = true
+            when '--boundary'
+              pvalue_boundary = argv.shift.to_sym
+              raise 'boundary should be either lower or upper'  unless  pvalue_boundary == :lower || pvalue_boundary == :upper
             when '--precise'
               precision_mode = :precise
               begin
@@ -81,7 +86,7 @@ module Macroape
         end
         raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.parameters.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.parameters.pvalues.include? pvalue
         if filename == '.stdin'
           query_input = $stdin.read
         else
@@ -90,58 +95,69 @@ module Macroape
         end
         query_pwm = data_model.new(query_input).to_pwm
-        query_pwm.set_parameters(background: background_query, max_hash_size: max_hash_size)
-        query_pwm_rough = query_pwm.discrete(collection.parameters.rough_discretization)
-        query_pwm_precise = query_pwm.discrete(collection.parameters.precise_discretization)
-        query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.threshold_and_real_pvalue(pvalue)
-        query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.threshold_and_real_pvalue(pvalue)
+        query_pwm.set_parameters(background: query_background, max_hash_size: max_hash_size)
+        query_pwm_rough = query_pwm.discrete(rough_discretization)
+        query_pwm_precise = query_pwm.discrete(precise_discretization)
+        if pvalue_boundary == :lower
+          query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.threshold_and_real_pvalue(pvalue)
+          query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.threshold_and_real_pvalue(pvalue)
+        else
+          query_threshold_rough, query_rough_real_pvalue = query_pwm_rough.weak_threshold_and_real_pvalue(pvalue)
+          query_threshold_precise, query_precise_real_pvalue = query_pwm_precise.weak_threshold_and_real_pvalue(pvalue)
+        end
         if query_precise_real_pvalue == 0
-          $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{collection.parameters.precise_discretization}. It's impossible to scan collection for this motif"
+          $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{precise_discretization}. It's impossible to scan collection for this motif"
           return
         end
         if query_rough_real_pvalue == 0
           query_pwm_rough, query_threshold_rough = query_pwm_precise, query_threshold_precise
-          $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{collection.parameters.rough_discretization}. Forcing precise discretization level of #{collection.parameters.precise_discretization}"
+          $stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{rough_discretization}. Forcing precise discretization level of #{precise_discretization}"
         end
         similarities = {}
         precision_file_mode = {}
-        collection.each do |collection_pwm, pwm_info|
-          name = collection_pwm.name
-          STDERR.puts name unless silent
-          collection_pwm.set_parameters(background: collection.parameters.background, max_hash_size: max_hash_size)
-          if pwm_info.rough
-            collection_pwm_rough = collection_pwm.discrete(collection.parameters.rough_discretization)
-            collection_threshold_rough = pwm_info.rough[pvalue] * collection.parameters.rough_discretization
+        collection.each_with_index do |motif, index|
+          name = motif.name
+          STDERR.puts "Testing motif #{name} (#{index+1} of #{collection.size}, #{index*100/collection.size}% complete)"  unless silent
+          motif.set_parameters(background: collection_background, max_hash_size: max_hash_size)
+          if motif.rough[pvalue]
+            collection_pwm_rough = motif.pwm.discrete(rough_discretization)
+            collection_threshold_rough = motif.rough[pvalue] * rough_discretization
             info = Macroape::PWMCompare.new(query_pwm_rough, collection_pwm_rough).set_parameters(max_pair_hash_size: max_pair_hash_size).jaccard(query_threshold_rough, collection_threshold_rough)
-            precision_file_mode[name] = :rough
+            info[:precision_mode] = :rough
           end
-          if !pwm_info.rough || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)
-            collection_pwm_precise = collection_pwm.discrete(collection.parameters.precise_discretization)
-            collection_threshold_precise = pwm_info.precise[pvalue] * collection.parameters.precise_discretization
+          if !motif.rough[pvalue] || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)
+            collection_pwm_precise = motif.pwm.discrete(precise_discretization)
+            collection_threshold_precise = motif.precise[pvalue] * precise_discretization
             info = Macroape::PWMCompare.new(query_pwm_precise, collection_pwm_precise).set_parameters(max_pair_hash_size: max_pair_hash_size).jaccard(query_threshold_precise, collection_threshold_precise)
-            precision_file_mode[name] = :precise
+            info[:precision_mode] = :precise
           end
+          info[:name] = name
           similarities[name] = info
         end
-        puts "#pwm\tsimilarity\tshift\toverlap\torientation"
-        similarities.sort_by do |name, info|
-          info[:similarity]
-        end.reverse.each do |name, info|
-          precision_text = (precision_file_mode[name] == :precise) ? "\t*" : ""
-          puts "#{name}\t#{info[:similarity]}\t#{info[:shift]}\t#{info[:overlap]}\t#{info[:orientation]}#{precision_text}" if info[:similarity] >= cutoff
-        end
+        STDERR.puts "100% complete"  unless silent
+        similarities_to_output = similarities.sort_by{|name, info| info[:similarity] }.reverse.select{|name,info| info[:similarity] >= cutoff }.map{|name,info|info}
+        puts Helper.scan_collection_infos_string( similarities_to_output,
+                                                  {cutoff: cutoff,
+                                                  precision_mode: precision_mode,
+                                                  rough_discretization: rough_discretization,
+                                                  precise_discretization: precise_discretization,
+                                                  minimal_similarity: minimal_similarity,
+                                                  pvalue: pvalue,
+                                                  pvalue_boundary: pvalue_boundary,
+                                                  collection_background: collection_background,
+                                                  query_background: query_background} )
       rescue => err
-        STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n"
+        STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
       end
     end
   end
 end

data/lib/macroape/cli.rb CHANGED Viewed

@@ -1,5 +1,188 @@
+require 'bioinform/support/strip_doc'
+class String
+  def snake_case
+    gsub(/[A-Z]+/){|big| "_#{big.downcase}" }.sub(/^_/,'')
+  end
+end
+class Module
+  def run_tool_cmd
+    if Macroape::STANDALONE
+      "ruby #{tool_name}.rb"
+    else
+      tool_name
+    end
+  end
+  def tool_name
+    self.name.split('::').last.snake_case
+  end
+end
 module Macroape
   module CLI
+    class OutputInformation
+      def initialize(data = nil)
+        @table_parameter_descriptions = []
+        @parameter_descriptions = []
+        @parameter_value_infos = []
+        @resulting_value_descriptions = []
+        @resulting_value_infos = []
+        @table_headers = []
+        @table_rows = []
+        @table_rows_callbacks = []
+        @data = data
+        yield self  if block_given?
+      end
+      def parameters_info
+        [*@parameter_descriptions, *@parameter_value_infos]
+      end
+      def resulting_values_info
+        [*@resulting_value_descriptions, *@resulting_value_infos]
+      end
+      def result
+        [parameters_info, resulting_values_info, resulting_table].reject(&:empty?).map{|b|b.join("\n")}.join("\n#\n")
+        #[*parameters_info, '#', *resulting_values_info, '#', *resulting_table].join("\n")
+      end
+      def add_parameter(param_name, description, value, &block)
+        @parameter_descriptions << parameter_description_string(param_name, description)
+        @parameter_value_infos << "# #{param_name} = #{value}"
+      end
+      def add_resulting_value(param_name, description, value, &block)
+        @resulting_value_descriptions << parameter_description_string(param_name, description)
+        @resulting_value_infos << "#{param_name}\t#{value}"
+      end
+      def add_table_parameter(param_name, description, key_in_hash, &block)
+        @table_parameter_descriptions << parameter_description_string(param_name, description)
+        add_table_parameter_without_description(param_name, key_in_hash, &block)
+      end
+      def add_table_parameter_without_description(param_name, key_in_hash, &block)
+        @table_headers << param_name
+        @table_rows << key_in_hash
+        @table_rows_callbacks << block
+      end
+      def parameter_description_string(param_name, description)
+        "# #{param_name}: #{description}"
+      end
+      def table_content
+        @data.map{|info|
+          @table_rows.zip(@table_rows_callbacks).map{|row,callback| callback ? callback.call(info[row]) : info[row] }.join("\t")
+        }
+      end
+      def header_content
+        '# ' + @table_headers.join("\t")
+      end
+      def resulting_table
+        @data ? [*@table_parameter_descriptions, header_content, *table_content] : []
+      end
+      # printed only if it is not wordwise [1,1,1,1]
+      def background_parameter(param_name, description, value, &block)
+        add_parameter(param_name, description, value.join(','), &block)  unless value == [1,1,1,1]
+      end
+    end
+    module Helper
+      def self.similarity_info_string(info)
+        OutputInformation.new { |infos|
+          infos.add_parameter('V', 'discretization', info[:discretization] )
+          infos.add_parameter('P', 'requested P-value', info[:requested_pvalue])  unless info[:predefined_threshold_first] && info[:predefined_threshold_second]
+          infos.add_parameter('T1', 'threshold for the 1st matrix', info[:predefined_threshold_first] )  if info[:predefined_threshold_first]
+          infos.add_parameter('T2', 'threshold for the 2nd matrix', info[:predefined_threshold_second] )  if info[:predefined_threshold_second]
+          infos.add_parameter('PB', 'P-value boundary', info[:pvalue_boundary])
+          if info[:first_background] == info[:second_background]
+            infos.background_parameter('B', 'background', info[:first_background])
+          else
+            infos.background_parameter('B1', 'background for the 1st model', info[:first_background])
+            infos.background_parameter('B2', 'background for the 2nd model', info[:second_background])
+          end
+          infos.add_resulting_value('S', 'similarity', info[:similarity])
+          infos.add_resulting_value('D', 'distance (1-similarity)', info[:tanimoto])
+          infos.add_resulting_value('L', 'length of the alignment', info[:alignment_length])
+          infos.add_resulting_value('SH', 'shift of the 2nd PWM relative to the 1st', info[:shift])
+          infos.add_resulting_value('OR', 'orientation of the 2nd PWM relative to the 1st', info[:orientation])
+          infos.add_resulting_value('A1', 'aligned 1st matrix', info[:text].lines.to_a.first.strip )
+          infos.add_resulting_value('A2', 'aligned 2nd matrix', info[:text].lines.to_a.last.strip )
+          infos.add_resulting_value('W', 'number of words recognized by both models (model = PWM + threshold)', info[:recognized_by_both] )
+          infos.add_resulting_value('W1', 'number of words and recognized by the first model', info[:recognized_by_first] )
+          infos.add_resulting_value('P1', 'P-value for the 1st matrix', info[:real_pvalue_first] )
+          infos.add_resulting_value('T1', 'threshold for the 1st matrix', info[:threshold_first] )  unless info[:predefined_threshold_first]
+          infos.add_resulting_value('W2', 'number of words recognized by the 2nd model', info[:recognized_by_second] )
+          infos.add_resulting_value('P2', 'P-value for the 2nd matrix', info[:real_pvalue_second] )
+          infos.add_resulting_value('T2', 'threshold for the 2nd matrix', info[:threshold_second] )  unless info[:predefined_threshold_second]
+        }.result
+      end
+############################################
+      def self.threshold_infos_string(data, parameters)
+        OutputInformation.new(data) { |infos|
+          infos.add_parameter('V', 'discretization value', parameters[:discretization])
+          infos.add_parameter('PB', 'P-value boundary', parameters[:pvalue_boundary])
+          infos.background_parameter('B', 'background', parameters[:background])
+          infos.add_table_parameter('P', 'requested P-value', :expected_pvalue)
+          infos.add_table_parameter('AP', 'actual P-value', :real_pvalue)
+          infos.add_table_parameter('W', 'number of recognized words', :recognized_words)  if parameters[:background] == [1, 1, 1, 1]
+          infos.add_table_parameter('T', 'threshold', :threshold)
+        }.result
+      end
+############################################
+      def self.scan_collection_infos_string(data, parameters)
+        OutputInformation.new(data) { |infos|
+          infos.add_parameter('MS', 'minimal similarity to output', parameters[:cutoff])
+          infos.add_parameter('P', 'P-value', parameters[:pvalue])
+          infos.add_parameter('PB', 'P-value boundary', parameters[:pvalue_boundary])
+          if parameters[:precision_mode] == :precise
+            infos.add_parameter('VR', 'discretization value, rough', parameters[:rough_discretization])
+            infos.add_parameter('VP', 'discretization value, precise', parameters[:precise_discretization])
+            infos.add_parameter('MP', 'minimal similarity for the 2nd pass in \'precise\' mode', parameters[:minimal_similarity])
+          else
+            infos.add_parameter('V', 'discretization value', parameters[:rough_discretization])
+          end
+          infos.background_parameter('BQ', 'background for query matrix', parameters[:query_background])
+          infos.background_parameter('BC', 'background for collection', parameters[:collection_background])
+          infos.add_table_parameter_without_description('motif', :name)
+          infos.add_table_parameter_without_description('similarity', :similarity)
+          infos.add_table_parameter_without_description('shift', :shift)
+          infos.add_table_parameter_without_description('overlap', :overlap)
+          infos.add_table_parameter_without_description('orientation', :orientation)
+          if parameters[:precision_mode] == :precise
+            infos.add_table_parameter_without_description('precise mode', :precision_mode){|precision| precision == :precise ? '*' : '.' }
+          end
+        }.result
+      end
+############################################
+      def self.find_pvalue_info_string(data, parameters)
+        OutputInformation.new(data) {|infos|
+          infos.add_parameter('V', 'discretization value', parameters[:discretization])
+          infos.background_parameter('B', 'background', parameters[:background])
+          infos.add_table_parameter('T', 'threshold', :threshold)
+          infos.add_table_parameter('W', 'number of recognized words', :number_of_recognized_words)  if parameters[:background] == [1,1,1,1]
+          infos.add_table_parameter('P', 'P-value', :pvalue)
+        }.result
+      end
+    end
   end
 end

data/lib/macroape/counting.rb CHANGED Viewed

@@ -4,13 +4,19 @@ module Bioinform
   class PWM
     # sets or gets limit size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
     make_parameters :max_hash_size
     def threshold(pvalue)
       thresholds(pvalue){|_, thresh, _| return thresh }
     end
     def threshold_and_real_pvalue(pvalue)
       thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
     end
+    def weak_threshold(pvalue)
+      weak_thresholds(pvalue){|_, thresh, _| return thresh }
+    end
+    def weak_threshold_and_real_pvalue(pvalue)
+      weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
+    end
     def thresholds(*pvalues)
       thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
@@ -20,11 +26,26 @@ module Bioinform
       end
     end
+    # "weak" means that threshold has real pvalue not less than given pvalue, while usual threshold not greater
+    def weak_thresholds(*pvalues)
+      thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
+        threshold = thresholds.begin.to_f
+        real_pvalue = counts.begin.to_f / vocabulary_volume
+        yield pvalue, threshold, real_pvalue
+      end
+    end
     def count_distribution_under_pvalue(max_pvalue)
       cnt_distribution = {}
       look_for_count = max_pvalue * vocabulary_volume
       until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
-        cnt_distribution = count_distribution_after_threshold(threshold_gauss_estimation(max_pvalue))
+        begin
+          approximate_threshold = threshold_gauss_estimation(max_pvalue)
+        rescue
+          approximate_threshold = worst_score
+        end
+        cnt_distribution = count_distribution_after_threshold(approximate_threshold)
         max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
       end
@@ -83,13 +104,18 @@ module Bioinform
     def counts_by_thresholds(*thresholds)
       scores = count_distribution_after_threshold(thresholds.min)
-      thresholds.map{ |threshold|
-        scores.inject(0.0){|sum,(score,count)|  (score >= threshold) ? sum + count : sum}
+      thresholds.inject({}){ |hsh, threshold|
+        hsh[threshold] = scores.inject(0.0){|sum,(score,count)|  (score >= threshold) ? sum + count : sum}
+        hsh
       }
     end
+    def count_by_threshold(threshold)
+      counts_by_thresholds(threshold)[threshold]
+    end
     def pvalue_by_threshold(threshold)
-      counts_by_thresholds(threshold).first / vocabulary_volume
+      count_by_threshold(threshold) / vocabulary_volume
     end
   end
 end

data/lib/macroape/pwm_compare.rb CHANGED Viewed

@@ -2,7 +2,7 @@ require 'bioinform/support/parameters'
 module Macroape
   class PWMCompare
-    include Parameters
+    include Bioinform::Parameters
     # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
     make_parameters :max_pair_hash_size
@@ -18,13 +18,19 @@ module Macroape
         alignment.alignment_infos.merge( alignment.jaccard(threshold_first, threshold_second) )
       end.max_by {|alignment_infos| alignment_infos[:similarity] }
     end
     def jaccard_by_pvalue(pvalue)
       threshold_first = first.threshold(pvalue)
       threshold_second = second.threshold(pvalue)
       jaccard(threshold_first, threshold_second)
     end
+    def jaccard_by_weak_pvalue(pvalue)
+      threshold_first = first.weak_threshold(pvalue)
+      threshold_second = second.weak_threshold(pvalue)
+      jaccard(threshold_first, threshold_second)
+    end
     def each_alignment
       (-second.length..first.length).to_a.product([:direct,:revcomp]) do |shift, orientation|
         yield PWMCompareAligned.new(first, second, shift, orientation).set_parameters(max_pair_hash_size: max_pair_hash_size)

data/lib/macroape/pwm_compare_aligned.rb CHANGED Viewed

@@ -1,14 +1,14 @@
 require 'bioinform/support/parameters'
-require_relative './aligned_pair_intersection'
+require_relative 'aligned_pair_intersection'
 module Macroape
   class PWMCompareAligned
-    include Parameters
+    include Bioinform::Parameters
     # sets or gets limit of summary size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
     make_parameters :max_pair_hash_size
     attr_reader :first, :second, :length, :shift, :orientation, :first_length, :second_length, :parameters
     def initialize(first_unaligned, second_unaligned, shift, orientation)
       @parameters = OpenStruct.new
       @shift, @orientation = shift, orientation
@@ -18,7 +18,7 @@ module Macroape
       first, second = first_unaligned, second_unaligned
       second = second.reverse_complement  if revcomp?
       if shift > 0
         second = second.left_augment(shift)
       else
@@ -28,8 +28,6 @@ module Macroape
       @first = first.right_augment(@length - first.length)
       @second = second.right_augment(@length - second.length)
     end
     def direct?
       orientation == :direct
@@ -90,8 +88,8 @@ module Macroape
     end
     def jaccard(first_threshold, second_threshold)
-      f = first.counts_by_thresholds(first_threshold).first
-      s = second.counts_by_thresholds(second_threshold).first
+      f = first.count_by_threshold(first_threshold)
+      s = second.count_by_threshold(second_threshold)
       if f == 0 || s == 0
         return {similarity: -1, tanimoto: -1, recognized_by_both: 0,
               recognized_by_first: f,
@@ -104,15 +102,22 @@ module Macroape
       union = f + s - intersect
       similarity = intersect.to_f / union
       { similarity: similarity,  tanimoto: 1.0 - similarity,  recognized_by_both: intersect,
-        recognized_by_first: f,  recognized_by_second: s }
+        recognized_by_first: f,  recognized_by_second: s,
+        real_pvalue_first: f / first.vocabulary_volume, real_pvalue_second: s / second.vocabulary_volume }
     end
     def jaccard_by_pvalue(pvalue)
       threshold_first = first.threshold(pvalue)
       threshold_second = second.threshold(pvalue)
       jaccard(threshold_first, threshold_second)
     end
+    def jaccard_by_weak_pvalue(pvalue)
+      threshold_first = first.weak_threshold(pvalue)
+      threshold_second = second.weak_threshold(pvalue)
+      jaccard(threshold_first, threshold_second)
+    end
     def self.calculate_alignment_length(first_len, second_len, shift)
       if shift > 0
         [first_len, second_len + shift].max

data/lib/macroape/version.rb CHANGED Viewed

@@ -1,3 +1,4 @@
 module Macroape
-  VERSION = "3.3.7"
+  VERSION = "3.3.8"
+  STANDALONE = false
 end

data/macroape.gemspec CHANGED Viewed

@@ -15,5 +15,6 @@ Gem::Specification.new do |gem|
   gem.require_paths = ["lib"]
   gem.version       = Macroape::VERSION
-  gem.add_dependency('bioinform', '= 0.1.8')
+  gem.add_dependency('bioinform', '= 0.1.9')
+  gem.add_dependency('docopt', '= 0.5.0')
 end