macroape 3.3.4 → 3.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -20,7 +20,7 @@ Or install it yourself as:
20
20
  For more information read manual at https://docs.google.com/document/pub?id=1_jsxhMNzMzy4d2d_byAd3n6Szg5gEcqG_Sf7w9tEqWw (not last version but comprehensive description of approach)
21
21
 
22
22
  ## Basic usage as a command-line tool
23
- MacroAPE have 6 command line tools:
23
+ MacroAPE has 7 command line tools:
24
24
 
25
25
  ### Tools for calculating thresholds and pvalues:
26
26
  * find_threshold \<PWM file\> [-p \<pvalue\> (by default: 0.0005)]
@@ -34,8 +34,12 @@ Or install it yourself as:
34
34
  * preprocess_collection \<folder with motif files\> [-o \<collection output file\>]
35
35
  * scan_collection \<query PWM file\> \<collection file\>
36
36
 
37
+ ### Tool for finding the mutual alignment of several motifs relative to the first (leader) motif. It's designed to be used with sequence_logo to draw logos of clusters
38
+ * align_motifs \<pwm_leader\> \<pwm_2\> \<pwm_3\> ...
39
+
37
40
  Also you can use -h option to print help for a tool in console.
38
41
  There are lots of different command line options. The most useful option is -d <discretization=1|10|100|1000>. You can vary the precision/speed trade-off by specifying a discretization. For more information, look through the manual.
42
+ Some of the tools can also process PCMs in addition to PWMs.
39
43
 
40
44
  ## Basic usage in your code
41
45
  require 'macroape'
@@ -51,7 +55,7 @@ Or install it yourself as:
51
55
  similarity_info = cmp.jaccard(first_threshold, second_threshold)
52
56
  puts "Jaccard similarity: #{similarity_info[:similarity]}"
53
57
 
54
- For more details look a source code of utilities in lib/exec/ folder
58
+ For more details, look at the source code of the utilities in the lib/macroape/cli/ folder
55
59
 
56
60
  ## Contributing
57
61
 
data/TODO.txt CHANGED
@@ -1,6 +1,3 @@
1
- Absolutely necessary:
2
- (already work in preprocess_colleсtion and align_motifs) Make it available to load PCM files with (it should be first preprocessed to PWMs in a standardized way) -- may be it's better to use pipeline
3
-
4
1
  Specs and tests:
5
2
  create spec on use of MaxHashSize, MaxHashSizeDouble
6
3
  create spec for testing case when {real_pvalue == 0, threshold == best_score + 1}
@@ -17,5 +14,5 @@ Usability issues:
17
14
 
18
15
  remove .stdin placeholder. Use tty? method instead
19
16
 
20
- use OptionParser (??? can OptionParser get stub ARGV ???)
17
+ use OptionParser or docopt
21
18
  make options more uniform so that some of them were reusable(and the question: can I apply two option parsers consequently?)
@@ -44,7 +44,8 @@ module Macroape
44
44
  second_background = [1,1,1,1]
45
45
  max_hash_size = 1000000
46
46
  max_pair_hash_size = 1000
47
-
47
+
48
+ data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
48
49
 
49
50
  first_file = argv.shift
50
51
  second_file = argv.shift
@@ -93,18 +94,20 @@ module Macroape
93
94
  parser = Bioinform::StringParser.new($stdin.read) if first_file == '.stdin' || second_file == '.stdin'
94
95
 
95
96
  if first_file == '.stdin'
96
- pwm_first = Bioinform::PWM.new( parser.parse )
97
+ input_first = parser.parse
97
98
  else
98
99
  raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
99
- pwm_first = Bioinform::PWM.new(File.read(first_file))
100
+ input_first = File.read(first_file)
100
101
  end
102
+ pwm_first = data_model.new(input_first).to_pwm
101
103
 
102
104
  if second_file == '.stdin'
103
- pwm_second = Bioinform::PWM.new( parser.parse )
105
+ input_second = parser.parse
104
106
  else
105
107
  raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
106
- pwm_second = Bioinform::PWM.new(File.read(second_file))
108
+ input_second = File.read(second_file)
107
109
  end
110
+ pwm_second = data_model.new(input_second).to_pwm
108
111
 
109
112
  pwm_first.background!(first_background).max_hash_size!(max_hash_size).discrete!(discretization)
110
113
  pwm_second.background!(second_background).max_hash_size!(max_hash_size).discrete!(discretization)
@@ -47,7 +47,7 @@ module Macroape
47
47
  max_hash_size = 1000000
48
48
  max_pair_hash_size = 1000
49
49
 
50
-
50
+ data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
51
51
  first_file = argv.shift
52
52
  second_file = argv.shift
53
53
  raise "You'd specify two input sources (each is filename or .stdin)" unless first_file and second_file
@@ -76,18 +76,20 @@ module Macroape
76
76
  parser = Bioinform::StringParser.new($stdin.read) if first_file == '.stdin' || second_file == '.stdin'
77
77
 
78
78
  if first_file == '.stdin'
79
- pwm_first = Bioinform::PWM.new( parser.parse )
79
+ input_first = parser.parse
80
80
  else
81
81
  raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
82
- pwm_first = Bioinform::PWM.new(File.read(first_file))
82
+ input_first = File.read(first_file)
83
83
  end
84
+ pwm_first = data_model.new(input_first).to_pwm
84
85
 
85
86
  if second_file == '.stdin'
86
- pwm_second = Bioinform::PWM.new( parser.parse )
87
+ input_second = parser.parse
87
88
  else
88
89
  raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
89
- pwm_second = Bioinform::PWM.new(File.read(second_file))
90
+ input_second = File.read(second_file)
90
91
  end
92
+ pwm_second = data_model.new(input_second).to_pwm
91
93
 
92
94
  pwm_first.background!(first_background).max_hash_size!(max_hash_size).discrete!(discretization)
93
95
  pwm_second.background!(second_background).max_hash_size!(max_hash_size).discrete!(discretization)
@@ -41,6 +41,7 @@ module Macroape
41
41
  thresholds = []
42
42
  max_hash_size = 1000000
43
43
 
44
+ data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
44
45
  filename = argv.shift
45
46
 
46
47
  loop do
@@ -68,11 +69,12 @@ module Macroape
68
69
 
69
70
 
70
71
  if filename == '.stdin'
71
- pwm = Bioinform::PWM.new( $stdin.read )
72
+ input = $stdin.read
72
73
  else
73
74
  raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
74
- pwm = Bioinform::PWM.new( File.read(filename) )
75
+ input = File.read(filename)
75
76
  end
77
+ pwm = data_model.new(input).to_pwm
76
78
  pwm.background!(background).max_hash_size!(max_hash_size).discrete!(discretization)
77
79
 
78
80
  counts = pwm.counts_by_thresholds(* thresholds.map{|count| count * discretization})
@@ -36,6 +36,7 @@ module Macroape
36
36
  default_pvalues = [0.0005]
37
37
  discretization = 10000
38
38
  max_hash_size = 1000000
39
+ data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
39
40
 
40
41
  filename = argv.shift
41
42
  raise "No input. You'd specify input source: filename or .stdin" unless filename
@@ -63,12 +64,12 @@ module Macroape
63
64
  pvalues = default_pvalues if pvalues.empty?
64
65
 
65
66
  if filename == '.stdin'
66
- pwm = Bioinform::PWM.new( $stdin.read )
67
+ input = $stdin.read
67
68
  else
68
69
  raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
69
- pwm = Bioinform::PWM.new( File.read(filename) )
70
+ input = File.read(filename)
70
71
  end
71
-
72
+ pwm = data_model.new(input).to_pwm
72
73
  pwm.background!(background).max_hash_size!(max_hash_size).discrete!(discretization)
73
74
 
74
75
  pwm.thresholds(*pvalues) do |pvalue, threshold, real_pvalue|
@@ -74,7 +74,7 @@ module Macroape
74
74
  collection = Macroape::Collection.new(rough_discretization, precise_discretization, background, pvalues)
75
75
 
76
76
  if File.directory?(data_source)
77
- motifs = Dir.glob(File.join(data_source,'*')).map do |filename|
77
+ motifs = Dir.glob(File.join(data_source,'*')).map do |filename|
78
78
  pwm = data_model.new(File.read(filename))
79
79
  pwm.name ||= File.basename(filename, File.extname(filename))
80
80
  pwm
@@ -36,6 +36,7 @@ module Macroape
36
36
  exit
37
37
  end
38
38
 
39
+ data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM
39
40
  filename = argv.shift
40
41
  collection_file = argv.shift
41
42
  raise "No input. You'd specify input source for pat: filename or .stdin" unless filename
@@ -82,12 +83,13 @@ module Macroape
82
83
  raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue
83
84
 
84
85
  if filename == '.stdin'
85
- query_pwm = Bioinform::PWM.new( $stdin.read )
86
+ query_input = $stdin.read
86
87
  else
87
88
  raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
88
- query_pwm = Bioinform::PWM.new(File.read(filename))
89
+ query_input = File.read(filename)
89
90
  end
90
91
 
92
+ query_pwm = data_model.new(query_input).to_pwm
91
93
  query_pwm.background(background_query).max_hash_size(max_hash_size)
92
94
 
93
95
  query_pwm_rough = query_pwm.discrete(collection.rough_discretization)
@@ -1,3 +1,3 @@
1
1
  module Macroape
2
- VERSION = "3.3.4"
2
+ VERSION = "3.3.5"
3
3
  end
@@ -1,6 +1,10 @@
1
1
  require 'test_helper'
2
2
 
3
3
  class TestEvalAlignment < Test::Unit::TestCase
4
+ def test_process_pcm_files
5
+ assert_equal "0.2420758234928527\n779.0\t11\n.>>>>>>>>>>\n>>>>>>>>>>>\n-1\tdirect\n", Helpers.eval_alignment_output('test/data/KLF4_f2.pcm test/data/SP1_f1.pcm -1 direct --pcm')
6
+ end
7
+
4
8
  def test_process_at_optimal_alignment
5
9
  assert_equal "0.2420758234928527\n779.0\t11\n.>>>>>>>>>>\n>>>>>>>>>>>\n-1\tdirect\n", Helpers.eval_alignment_output('test/data/KLF4_f2.pat test/data/SP1_f1.pat -1 direct')
6
10
  end
@@ -1,6 +1,9 @@
1
1
  require 'test_helper'
2
2
 
3
3
  class TestEvalSimilarity < Test::Unit::TestCase
4
+ def test_process_pair_of_pcms
5
+ assert_equal "0.2420758234928527\n779.0\t11\n.>>>>>>>>>>\n>>>>>>>>>>>\n-1\tdirect\n", Helpers.eval_similarity_output('test/data/KLF4_f2.pcm test/data/SP1_f1.pcm --pcm')
6
+ end
4
7
  def test_process_pair_of_pwms
5
8
  assert_equal "0.2420758234928527\n779.0\t11\n.>>>>>>>>>>\n>>>>>>>>>>>\n-1\tdirect\n", Helpers.eval_similarity_output('test/data/KLF4_f2.pat test/data/SP1_f1.pat')
6
9
  end
@@ -1,6 +1,9 @@
1
1
  require 'test_helper'
2
2
 
3
3
  class FindPvalueTest < Test::Unit::TestCase
4
+ def test_process_pcm
5
+ assert_equal "4.1719\t1048.0\t0.00099945068359375\n", Helpers.find_pvalue_output('test/data/KLF4_f2.pcm 4.1719 --pcm')
6
+ end
4
7
  def test_process_one_threshold
5
8
  assert_equal "4.1719\t1048.0\t0.00099945068359375\n", Helpers.find_pvalue_output('test/data/KLF4_f2.pat 4.1719')
6
9
  end
@@ -10,6 +10,11 @@ class FindThresholdTest < Test::Unit::TestCase
10
10
  }
11
11
  assert_equal pvalues, ['0.0005', '0.001']
12
12
  end
13
+ def test_process_pcm
14
+ pvalue, threshold, real_pvalue = Helpers.find_threshold_output('test/data/KLF4_f2.pcm -p 0.001 --pcm').strip.split("\t")
15
+ assert_equal '0.001', pvalue
16
+ assert_equal Helpers.obtain_pvalue_by_threshold("test/data/KLF4_f2.pat #{threshold}"), real_pvalue
17
+ end
13
18
  def test_process_one_pvalue
14
19
  pvalue, threshold, real_pvalue = Helpers.find_threshold_output('test/data/KLF4_f2.pat -p 0.001').strip.split("\t")
15
20
  assert_equal '0.001', pvalue
@@ -1,6 +1,10 @@
1
1
  require 'test_helper'
2
2
 
3
3
  class TestScanCollection < Test::Unit::TestCase
4
+ def test_scan_pcm
5
+ assert_equal File.read('test/data/KLF4_f2_scan_results_default_cutoff.txt').gsub("\r\n", "\n"),
6
+ Helpers.scan_collection_output('test/data/KLF4_f2.pcm test/data/test_collection.yaml --silent --pcm').gsub("\r\n","\n")
7
+ end
4
8
  def test_scan_default_cutoff
5
9
  assert_equal File.read('test/data/KLF4_f2_scan_results_default_cutoff.txt').gsub("\r\n", "\n"),
6
10
  Helpers.scan_collection_output('test/data/KLF4_f2.pat test/data/test_collection.yaml --silent').gsub("\r\n","\n")
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: macroape
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.4
4
+ version: 3.3.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-01 00:00:00.000000000 Z
12
+ date: 2012-09-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bioinform