bioinform 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,13 +11,13 @@ module Bioinform
11
11
  position.each_index.inject(0) { |sum,letter| sum + position[letter] * probability[letter] }**2
12
12
  end
13
13
  end
14
-
14
+
15
15
  def threshold_gauss_estimation(pvalue)
16
16
  sigma = Math.sqrt(score_variance)
17
17
  n_ = Math.inverf(1 - 2 * pvalue) * Math.sqrt(2)
18
18
  score_mean + n_ * sigma
19
19
  end
20
-
20
+
21
21
  def score(word)
22
22
  word = word.upcase
23
23
  raise ArgumentError, 'word in PWM#score(word) should have the same length as matrix' unless word.length == length
@@ -32,7 +32,7 @@ module Bioinform
32
32
  end
33
33
  end.inject(&:+)
34
34
  end
35
-
35
+
36
36
  def to_pwm
37
37
  self
38
38
  end
@@ -4,7 +4,7 @@ require 'bioinform/data_models/pm'
4
4
  module Bioinform
5
5
  class Parser
6
6
  attr_reader :input
7
-
7
+
8
8
  def initialize(*input)
9
9
  if input.size == 1 # [ [1,2,3,4] ], [ [[1,2,3,4],[5,6,7,8]] ]
10
10
  if input.first.is_a?(Array) && input.first.all?{|el| el.is_a? Numeric} # [ [1,2,3,4] ]
@@ -16,21 +16,21 @@ module Bioinform
16
16
  @input = input
17
17
  end
18
18
  end
19
-
19
+
20
20
  def parse!
21
21
  matrix = self.class.transform_input(input)
22
22
  raise 'Parsing error' unless self.class.valid_matrix?(matrix)
23
23
  {matrix: matrix}
24
24
  end
25
-
25
+
26
26
  def parse
27
27
  parse! rescue nil
28
28
  end
29
-
29
+
30
30
  def self.choose(input, data_model = PM)
31
31
  data_model.choose_parser(input).new(input)
32
32
  end
33
-
33
+
34
34
  def self.parse!(*input)
35
35
  self.new(*input).parse!
36
36
  end
@@ -41,7 +41,7 @@ module Bioinform
41
41
  def self.valid_matrix?(matrix)
42
42
  PM.valid_matrix?(matrix)
43
43
  end
44
-
44
+
45
45
  # {A: 1, C: 2, G: 3, T: 4} --> [1,2,3,4]
46
46
  # {A: [1,2], C: [3,4], G: [5,6], T: [7,8]} --> [[1,3,5,7],[2,4,6,8]] ( == [[1,2], [3,4], [5,6], [7,8]].transpose)
47
47
  def self.array_from_acgt_hash(hsh)
@@ -50,12 +50,12 @@ module Bioinform
50
50
  result = [:A,:C,:G,:T].collect{|letter| hsh[letter] }
51
51
  result.all?{|el| el.is_a?(Array)} ? result.transpose : result
52
52
  end
53
-
53
+
54
54
  # {a: 1, C: 2, 'g' => 3, 'T' => 4} --> {A: 1, C: 2, G: 3, T: 4}
55
55
  def self.normalize_hash_keys(hsh)
56
56
  hsh.collect_hash{|key,value| [key.to_s.upcase.to_sym, value] }
57
57
  end
58
-
58
+
59
59
  # [[1,2,3,4], [2,3,4,5]] --> [[1,2,3,4], [2,3,4,5]]
60
60
  # [{A:1, C:2, G:3, T:4}, {A:2, C:3, G:4, T:5}] --> [{A:1, C:2, G:3, T:4}, {A:2, C:3, G:4, T:5}]
61
61
  # {:A => [1,2,3], :c => [2,3,4], 'g' => [3,4,5], 'T' => [4,5,6]} --> [[1,2,3],[2,3,4],[3,4,5],[4,5,6]].transpose
@@ -66,18 +66,18 @@ module Bioinform
66
66
  else raise TypeError, 'input of Bioinform::Parser::array_from_acgt_hash should be Array or Hash'
67
67
  end
68
68
  end
69
-
69
+
70
70
  def self.transform_input(input)
71
71
  result = try_convert_to_array(input).map{|el| try_convert_to_array(el)}
72
72
  need_tranpose?(result) ? result.transpose : result
73
73
  end
74
-
74
+
75
75
  # tells whether the first index of the input matrix runs over positions (no transpose needed -- false) or over letters (transpose needed -- true)
76
76
  # [[1,3,5,7], [2,4,6,8]] --> false
77
77
  # [[1,2],[3,4],[5,6],[7,8]] --> true
78
78
  def self.need_tranpose?(input)
79
79
  (input.size == 4) && input.any?{|x| x.size != 4}
80
80
  end
81
-
81
+
82
82
  end
83
83
  end
@@ -4,11 +4,32 @@ require 'bioinform/parsers/string_parser'
4
4
  module Bioinform
5
5
  class StringFantomParser < StringParser
6
6
  def header_pat
7
- /NA (?<name>[\w.+:-]+)\n[\w\d]+ A C G T\n/
7
+ /NA (?<name>[\w.+:-]+)\n[\w\d]+ A C G T.*\n/
8
8
  end
9
-
9
+
10
10
  def row_pat
11
11
  /[\w\d]+ (?<row>(#{number_pat} )*#{number_pat})\n?/
12
12
  end
13
+
14
+ def scan_splitter
15
+ scanner.scan(/(\/\/\n)+/)
16
+ end
17
+
18
+ def parse_matrix
19
+ matrix = []
20
+ while row_string = scan_row
21
+ matrix << split_row(row_string)[0,4]
22
+ end
23
+ matrix.transpose
24
+ end
25
+
26
+ def parse!
27
+ scan_any_spaces
28
+ scan_splitter
29
+ name = parse_name
30
+ matrix = parse_matrix
31
+ Parser.parse!(matrix).merge(name: name)
32
+ end
33
+
13
34
  end
14
35
  end
@@ -2,7 +2,7 @@ require 'strscan'
2
2
  require 'bioinform/support'
3
3
  require 'bioinform/parsers/parser'
4
4
 
5
- module Bioinform
5
+ module Bioinform
6
6
  class StringParser < Parser
7
7
  attr_reader :scanner, :row_acgt_markers
8
8
  def initialize(input)
@@ -10,40 +10,40 @@ module Bioinform
10
10
  super
11
11
  @scanner = StringScanner.new(input.multiline_squish)
12
12
  end
13
-
13
+
14
14
  def number_pat
15
15
  /[+-]?\d+(\.\d+)?([eE][+-]?\d{1,3})?/
16
16
  end
17
-
17
+
18
18
  def header_pat
19
19
  />?\s*(?<name>\S+)\n/
20
20
  end
21
-
22
- def row_pat
21
+
22
+ def row_pat
23
23
  /([ACGT]\s*[:|]?\s*)?(?<row>(#{number_pat} )*#{number_pat})\n?/
24
24
  end
25
-
25
+
26
26
  def scan_row
27
27
  match = scanner.advanced_scan(row_pat)
28
28
  match && match[:row]
29
29
  end
30
-
30
+
31
31
  def split_row(row_string)
32
32
  row_string.split.map(&:to_f)
33
33
  end
34
-
34
+
35
35
  def scan_any_spaces
36
36
  scanner.scan(/\s+/)
37
37
  end
38
-
38
+
39
39
  def parse_name
40
40
  match = scanner.advanced_scan(header_pat)
41
41
  match && match[:name]
42
42
  end
43
-
43
+
44
44
  def parse_matrix
45
45
  matrix = []
46
- @row_acgt_markers = true if scanner.check(/A.*\nC.*\nG.*\nT.*\n?/)
46
+ @row_acgt_markers = true if scanner.check(/A.*\nC.*\nG.*\nT.*\n?/)
47
47
  while row_string = scan_row
48
48
  matrix << split_row(row_string)
49
49
  end
@@ -53,7 +53,7 @@ module Bioinform
53
53
  def parse_acgt_header
54
54
  scanner.scan(/A\s*C\s*G\s*T\s*\n/i)
55
55
  end
56
-
56
+
57
57
  def parse!
58
58
  scan_any_spaces
59
59
  name = parse_name
@@ -62,11 +62,11 @@ module Bioinform
62
62
  matrix = matrix.transpose if row_acgt_markers
63
63
  Parser.parse!(matrix).merge(name: name)
64
64
  end
65
-
65
+
66
66
  def scanner_reset
67
67
  scanner.reset
68
68
  end
69
-
69
+
70
70
  def each
71
71
  if block_given?
72
72
  scanner_reset
@@ -78,12 +78,12 @@ module Bioinform
78
78
  end
79
79
  end
80
80
  include Enumerable
81
-
81
+
82
82
  alias_method :split, :to_a
83
83
  def self.split(input)
84
84
  self.new(input).split
85
85
  end
86
-
86
+
87
87
  def self.split_on_motifs(input, pm_klass = PM)
88
88
  split(input).map{|el| pm_klass.new(el)}
89
89
  end
@@ -2,6 +2,6 @@ module Enumerable
2
2
  # %w{A C G T}.collect_hash{|k| [k*2, k*3] }
3
3
  # # ==> {"AA" => "AAA", "CC" => "CCC", "GG" => "GGG", "TT" => "TTT"}
4
4
  def collect_hash(&block)
5
- block_given? ? Hash[ collect(&block) ] : Hash[ collect{|k,v| [k,v]} ]
5
+ block_given? ? Hash[ collect(&block) ] : Hash[ collect{|k,v| [k,v]} ]
6
6
  end
7
7
  end
@@ -1,3 +1,3 @@
1
1
  module Bioinform
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.7"
3
3
  end
@@ -0,0 +1,11 @@
1
+ KLF4_f2
2
+ 1233.46088405354 93.18173277811673 1036.6014857092885 1258.2948629970272
3
+ 263.979242343185 5.314520555872139 3347.5949971525274 4.650205486388122
4
+ 76.7700780003465 6.643150694840173 3529.4896409394937 8.636095903292224
5
+ 57.86097393406657 18.102585643439472 3520.3342027139347 25.24120324653207
6
+ 518.1947904009378 1545.9062946905135 22.396758181071043 1535.0411222654507
7
+ 137.98151691820345 9.300410972776241 3456.320530770924 17.936506876068467
8
+ 115.27647661640499 81.51802997128804 1861.9425868567278 1562.801872093553
9
+ 227.8095486111286 42.84555258785854 3278.6396005325996 72.244263806387
10
+ 108.73384179997886 134.47328134862394 3162.880454846513 215.45138754285665
11
+ 238.49636899561344 2225.9561104691043 402.40727964384774 754.6792064294074
@@ -0,0 +1,11 @@
1
+ KLF4_f2
2
+ 1233.46088405354 93.18173277811673 1036.6014857092885 1258.2948629970272
3
+ 263.979242343185 5.314520555872139 3347.5949971525274 4.650205486388122
4
+ 76.7700780003465 6.643150694840173 3529.4896409394937 8.636095903292224
5
+ 57.86097393406657 18.102585643439472 3520.3342027139347 25.24120324653207
6
+ 518.1947904009378 1545.9062946905135 22.396758181071043 1535.0411222654507
7
+ 137.98151691820345 9.300410972776241 3456.320530770924 17.936506876068467
8
+ 115.27647661640499 81.51802997128804 1861.9425868567278 1562.801872093553
9
+ 227.8095486111286 42.84555258785854 3278.6396005325996 72.244263806387
10
+ 108.73384179997886 134.47328134862394 3162.880454846513 215.45138754285665
11
+ 238.49636899561344 2225.9561104691043 402.40727964384774 754.6792064294074
@@ -0,0 +1,11 @@
1
+ KLF4_f2
2
+ 0.30861857265872605 -2.254321000121579 0.13505703522674192 0.3285194224375633
3
+ -1.227018967707036 -4.814127713368663 1.3059890687390967 -4.908681463544344
4
+ -2.443469374521196 -4.648238485031404 1.3588686548279805 -4.441801801188402
5
+ -2.7177827948276123 -3.8073538975356565 1.356272809724262 -3.504104725510225
6
+ -0.5563232977367343 0.5340697765121405 -3.61417723090579 0.5270259776377405
7
+ -1.8687622060887386 -4.381483976582316 1.337932245336098 -3.815629658877517
8
+ -2.045671123823928 -2.384975142213679 0.7198551207724355 0.5449254135616948
9
+ -1.373157530374372 -3.0063112097748217 1.285188335493552 -2.5026044231773543
10
+ -2.1030513122772208 -1.8941348100402244 1.249265758393991 -1.4284210948906104
11
+ -1.3277128628152939 0.8982415633049462 -0.8080773665408135 -0.18161647647456935
@@ -0,0 +1,12 @@
1
+ SP1_f1
2
+ 682.6436366358055 443.1455214015781 2075.655346294993 287.211468117951
3
+ 299.8883246804867 103.74338315843572 2613.8927022405364 471.1315623708902
4
+ 591.4892493324709 42.631827541794564 2845.1654083148564 9.36948726124641
5
+ 7.071084742361592 45.29093411231232 3432.8847704374107 3.409183158303573
6
+ 91.308984085713 19.1536481364332 3373.656949880137 4.5363903481026
7
+ 809.2082973387932 2246.941954176211 61.30766021687515 371.19806071846244
8
+ 120.56476435866055 42.4349244403591 3242.1560628684038 83.50022078295852
9
+ 13.72524477409959 35.858220519297525 3332.4066864946167 106.66582066236779
10
+ 558.1188080161639 90.0084504200356 2694.854973210736 145.67374080342415
11
+ 264.0088462230318 254.7175868081866 2796.88087480315 173.0486646159857
12
+ 519.46013914282 1874.9349086474765 654.5411208373813 439.7198038226514
@@ -0,0 +1,12 @@
1
+ SP1_f1
2
+ -0.24435707885585292 -0.674823404693731 0.8657012535789866 -1.1060188862599287
3
+ -1.0631255752097797 -2.111925969423868 1.0960627561110403 -0.6138563775211977
4
+ -0.3872276234760535 -2.9739851913218045 1.1807800242010378 -4.338927525031566
5
+ -4.563896055436894 -2.9161633002532277 1.3684371349982638 -5.077972423609655
6
+ -2.2369752892820083 -3.7196436313301846 1.3510439136452734 -4.889930670508233
7
+ -0.07473964149330865 0.944919654762011 -2.6246857648086044 -0.8510983487822436
8
+ -1.9643526491643322 -2.978402770880115 1.3113096718240573 -2.324334259499025
9
+ -4.0155484139655835 -3.1384268078096667 1.3387488589788057 -2.084673903537648
10
+ -0.44509385828355363 -2.2510053061629702 1.1265431574368685 -1.7780413702431372
11
+ -1.1896356092245048 -1.2251832285630027 1.1636760063747527 -1.6080243648157353
12
+ -0.5166047365590571 0.7641033353626657 -0.2862677570028208 -0.68254820978656
@@ -1,6 +1,80 @@
1
1
  require 'spec_helper'
2
+ require 'shellwords'
2
3
  require 'bioinform/cli/pcm2pwm'
3
4
 
5
+
6
+ def run_pcm2pwm(cmd)
7
+ Bioinform::CLI::PCM2PWM.main(cmd.shellsplit)
8
+ end
9
+
4
10
  describe Bioinform::CLI::PCM2PWM do
11
+ before :each do
12
+ @start_dir = Dir.pwd
13
+ Dir.chdir File.join(File.dirname(__FILE__), 'data')
14
+ end
15
+ after :each do
16
+ File.delete('KLF4_f2.pwm') if File.exist?('KLF4_f2.pwm')
17
+ File.delete('SP1_f1.pwm') if File.exist?('SP1_f1.pwm')
18
+ File.delete('KLF4_f2.pat') if File.exist?('KLF4_f2.pat')
19
+ File.delete('KLF4 f2 spaced name.pwm') if File.exist?('KLF4 f2 spaced name.pwm')
20
+ FileUtils.rm_rf('pwm_folder') if Dir.exist?('pwm_folder')
21
+ Dir.chdir(@start_dir)
22
+ end
23
+
24
+ it 'should transform single PCM to PWM' do
25
+ run_pcm2pwm('KLF4_f2.pcm')
26
+ File.exist?('KLF4_f2.pwm').should be_true
27
+ File.read('KLF4_f2.pwm').should == File.read('KLF4_f2.pwm.result')
28
+ end
29
+
30
+ it 'should transform multiple PCMs to PWMs' do
31
+ run_pcm2pwm('KLF4_f2.pcm SP1_f1.pcm')
32
+
33
+ File.exist?('KLF4_f2.pwm').should be_true
34
+ File.read('KLF4_f2.pwm').should == File.read('KLF4_f2.pwm.result')
35
+
36
+ File.exist?('SP1_f1.pwm').should be_true
37
+ File.read('SP1_f1.pwm').should == File.read('SP1_f1.pwm.result')
38
+ end
39
+
40
+ it 'should transform extension to specified with --extension option' do
41
+ run_pcm2pwm('KLF4_f2.pcm --extension=pat')
42
+ File.exist?('KLF4_f2.pat').should be_true
43
+ File.read('KLF4_f2.pat').should == File.read('KLF4_f2.pwm.result')
44
+ end
45
+
46
+ it 'should save PWMs into folder specified with --folder option when folder exists' do
47
+ Dir.mkdir('pwm_folder') unless Dir.exist?('pwm_folder')
48
+ run_pcm2pwm('KLF4_f2.pcm --folder=pwm_folder')
49
+ File.exist?('pwm_folder/KLF4_f2.pwm').should be_true
50
+ File.read('pwm_folder/KLF4_f2.pwm').should == File.read('KLF4_f2.pwm.result')
51
+ end
52
+ it 'should save PWMs into folder specified with --folder option' do
53
+ FileUtils.rm_rf('pwm_folder') if Dir.exist?('pwm_folder')
54
+ run_pcm2pwm('KLF4_f2.pcm --folder=pwm_folder')
55
+ File.exist?('pwm_folder/KLF4_f2.pwm').should be_true
56
+ File.read('pwm_folder/KLF4_f2.pwm').should == File.read('KLF4_f2.pwm.result')
57
+ end
58
+
59
+ it 'should process PCMs with names obtained from STDIN' do
60
+ provide_stdin('KLF4_f2.pcm SP1_f1.pcm') { run_pcm2pwm('') }
61
+ File.exist?('KLF4_f2.pwm').should be_true
62
+ File.read('KLF4_f2.pwm').should == File.read('KLF4_f2.pwm.result')
63
+
64
+ File.exist?('SP1_f1.pwm').should be_true
65
+ File.read('SP1_f1.pwm').should == File.read('SP1_f1.pwm.result')
66
+ end
67
+
68
+ it 'should process PCMs with names obtained from STDIN when there are some options' do
69
+ provide_stdin('KLF4_f2.pcm') { run_pcm2pwm('-e pat') }
70
+ File.exist?('KLF4_f2.pat').should be_true
71
+ File.read('KLF4_f2.pat').should == File.read('KLF4_f2.pwm.result')
72
+ end
73
+
74
+ it 'should process PCMs having filename with spaces' do
75
+ run_pcm2pwm('"KLF4 f2 spaced name.pcm"')
76
+ File.exist?('KLF4 f2 spaced name.pwm').should be_true
77
+ File.read('KLF4 f2 spaced name.pwm').should == File.read('KLF4_f2.pwm.result')
78
+ end
5
79
 
6
80
  end
@@ -0,0 +1,96 @@
1
+ require 'spec_helper'
2
+ require 'bioinform/data_models/collection'
3
+
4
+ module Bioinform
5
+ describe Collection do
6
+ before :each do
7
+ @collection = Collection.new(name: 'Main collection')
8
+ @pm_1 = PM.new(matrix:[[1,1,1,1]],name:'Stub datamodel')
9
+ @pm_2 = PM.new(matrix:[[1,2,3,4],[4,3,2,1]],name:'Second stub')
10
+ @pm_3 = PM.new(matrix:[[11,12,13,14],[41,31,21,11]],name:'Third stub')
11
+ end
12
+ describe '#size' do
13
+ it 'should return size of collection' do
14
+ @collection << @pm_1 << @pm_2 << @pm_3
15
+ @collection.size.should == 3
16
+ end
17
+ end
18
+ describe '#<<' do
19
+ it 'should add element to collection' do
20
+ @collection << @pm_1
21
+ @collection << @pm_2
22
+ @collection << @pm_3
23
+ @collection.collection.map(&:first).should include(@pm_1, @pm_2, @pm_3)
24
+ end
25
+ it 'should be chainable' do
26
+ @collection << @pm_1 << @pm_2 << @pm_3
27
+ @collection.collection.map(&:first).should include(@pm_1, @pm_2, @pm_3)
28
+ end
29
+ it 'should mark motif with name' do
30
+ @collection << @pm_1 << @pm_2
31
+ @pm_1.should be_tagged('Main collection')
32
+ @pm_2.should be_tagged('Main collection')
33
+ end
34
+ it 'should mark motif with self' do
35
+ @collection << @pm_1 << @pm_2
36
+ @pm_1.should be_tagged(@collection)
37
+ @pm_2.should be_tagged(@collection)
38
+ end
39
+ end
40
+
41
+ describe '#each_pm' do
42
+ before :each do
43
+ @collection << @pm_1 << @pm_2 << @pm_3
44
+ end
45
+ context 'with block given' do
46
+ it 'should yield elements of collecton' do
47
+ expect{|b| @collection.each_pm(&b)}.to yield_successive_args(@pm_1, @pm_2, @pm_3)
48
+ end
49
+ end
50
+ context 'with block given' do
51
+ it 'return an Enumerator' do
52
+ @collection.each_pm.should be_kind_of(Enumerator)
53
+ end
54
+ end
55
+ end
56
+ describe '#each_pcm' do
57
+ before :each do
58
+ @collection << @pm_1 << @pm_2 << @pm_3
59
+ end
60
+ context 'with block given' do
61
+ it 'should yield elements of collecton converted to pcm' do
62
+ expect{|b| @collection.each_pcm(&b)}.to yield_successive_args(PCM, PCM, PCM)
63
+ end
64
+ end
65
+ context 'with block given' do
66
+ it 'return an Enumerator' do
67
+ @collection.each_pcm.should be_kind_of(Enumerator)
68
+ end
69
+ end
70
+ end
71
+
72
+ describe '#+' do
73
+ before :each do
74
+ @collection << @pm_1 << @pm_2 << @pm_3
75
+ @pm_sec_1 = PM.new(matrix: [[1,0,1,0],[0,0,0,0],[1,2,3,4]], name: 'Secondary collection matrix 1')
76
+ @pm_sec_2 = PM.new(matrix: [[1,2,1,2],[0,3,6,9],[1,2,3,4]], name: 'Secondary collection matrix 2')
77
+ @secondary_collection = Collection.new(name: 'Secondary collection')
78
+ @secondary_collection << @pm_sec_1 << @pm_sec_2
79
+ @summary_collection = @collection + @secondary_collection
80
+ end
81
+ it 'should create a collection consisting of all elements of both collections' do
82
+ @summary_collection.should be_kind_of(Collection)
83
+ @summary_collection.size.should == (@collection.size + @secondary_collection.size)
84
+ @summary_collection.collection.map(&:first).should include(@pm_1, @pm_2, @pm_3, @pm_sec_1, @pm_sec_2)
85
+ end
86
+ it 'should leave marks on motifs' do
87
+ @pm_1.should be_tagged('Main collection')
88
+ @pm_sec_1.should be_tagged('Secondary collection')
89
+ end
90
+ it 'should not mix marks of motifs in different collections' do
91
+ @pm_1.should_not be_tagged('Secondary collection')
92
+ @pm_sec_1.should_not be_tagged('Main collection')
93
+ end
94
+ end
95
+ end
96
+ end