bio-alignment 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/Gemfile +5 -4
  2. data/README.md +94 -9
  3. data/Rakefile +2 -1
  4. data/VERSION +1 -1
  5. data/doc/bio-alignment-design.md +75 -11
  6. data/features/bioruby-feature.rb +17 -0
  7. data/features/bioruby.feature +6 -1
  8. data/features/columns-feature.rb +2 -0
  9. data/features/edit/del_bridges-feature.rb +7 -3
  10. data/features/edit/del_bridges.feature +1 -2
  11. data/features/edit/del_non_informative_sequences-feature.rb +26 -0
  12. data/features/edit/del_non_informative_sequences.feature +19 -0
  13. data/features/edit/del_short_sequences-feature.rb +21 -0
  14. data/features/edit/del_short_sequences.feature +25 -0
  15. data/features/edit/gblocks-feature.rb +2 -2
  16. data/features/edit/mask_islands-feature.rb +17 -4
  17. data/features/edit/mask_islands.feature +28 -17
  18. data/features/edit/mask_serial_mutations-feature.rb +8 -6
  19. data/features/edit/mask_serial_mutations.feature +11 -11
  20. data/features/tree-feature.rb +66 -0
  21. data/features/tree.feature +45 -0
  22. data/lib/bio-alignment.rb +4 -1
  23. data/lib/bio-alignment/alignment.rb +58 -3
  24. data/lib/bio-alignment/codonsequence.rb +14 -2
  25. data/lib/bio-alignment/columns.rb +102 -0
  26. data/lib/bio-alignment/edit/del_bridges.rb +18 -1
  27. data/lib/bio-alignment/edit/del_non_informative_sequences.rb +27 -0
  28. data/lib/bio-alignment/edit/del_short_sequences.rb +28 -0
  29. data/lib/bio-alignment/edit/edit_columns.rb +22 -0
  30. data/lib/bio-alignment/edit/edit_rows.rb +49 -0
  31. data/lib/bio-alignment/edit/mask_islands.rb +115 -0
  32. data/lib/bio-alignment/edit/mask_serial_mutations.rb +44 -0
  33. data/lib/bio-alignment/elements.rb +86 -0
  34. data/lib/bio-alignment/rows.rb +52 -0
  35. data/lib/bio-alignment/sequence.rb +20 -14
  36. data/lib/bio-alignment/state.rb +64 -8
  37. data/lib/bio-alignment/tree.rb +77 -0
  38. data/spec/bio-alignment_spec.rb +57 -1
  39. data/spec/spec_helper.rb +3 -3
  40. metadata +47 -22
  41. data/lib/bio-alignment/column.rb +0 -47
@@ -0,0 +1,102 @@
1
+ require 'bio-alignment/state'
2
+
3
+ module Bio
4
+
5
+ module BioAlignment
6
+
7
# The Columns module provides accessors for the column list of an
# alignment, returning Column objects. The including class is expected
# to provide +rows+ (the list of sequences).
module Columns
  # Return a list of Column objects, one per column index. The list is
  # built on first use and memoized; the contents of the columns are
  # accessed lazily (a Column only stores the alignment and its index).
  def columns
    @columns ||= (0...num_columns).map { |col| Column.new(self, col) }
  end

  # def columns= list
  #   @columns = list
  # end

  # Return the number of columns, taken from the length of the first
  # row. An alignment without rows has zero columns.
  def num_columns
    first_row = rows.first
    first_row ? first_row.length : 0
  end

  # Return an alignment holding only the columns for which the block
  # returns a true value. The block is called with a Column object. The
  # originating sequences should have methods 'empty_copy' and '<<'.
  def columns_where(&block)
    # The verdict only depends on the column, so ask once per column
    # rather than once per element of every row.
    keep = columns.map { |column| block.call(column) }
    seqs = rows.map do |seq|
      new_seq = seq.empty_copy
      seq.each_with_index { |e, i| new_seq << e if keep[i] }
      new_seq
    end
    Alignment.new(seqs)
  end

  # Return one string with the state of every column (via state.to_s),
  # using '?' for columns that have no state attached.
  def columns_to_s
    columns.map { |c| c.state ? c.state.to_s : '?' }.join
  end

  # Replace the cached column list with clones of the cached Column
  # objects. When no column list has been built yet there is nothing to
  # clone; the list is then created lazily by +columns+.
  def clone_columns!
    return unless @columns

    @columns = @columns.map(&:clone)
  end
end
52
+
53
# Support the notion of columns in an alignment. A Column stores the
# alignment and a column index and fetches its elements from the
# alignment on demand. A column can have state by attaching state objects.
class Column
  include State
  include Enumerable

  # aln is the alignment this column belongs to, col the column index
  def initialize(aln, col)
    @aln = aln
    @col = col
  end

  # Return the element of this column in row +index+
  def [](index)
    @aln[index][@col]
  end

  # iterator fetches a column on demand, yielding column elements.
  # Without a block an Enumerator is returned.
  def each
    return enum_for(:each) unless block_given?

    @aln.each { |seq| yield seq[@col] }
  end

  # Number of elements in the column, i.e. the number of rows in the
  # alignment (memoized)
  def length
    @length ||= @aln.rows.size
  end

  # Count the elements for which the block returns a true value. String
  # elements are wrapped in an Element first, so the block can always
  # use the Element query methods. Called without a block (or with an
  # argument) this behaves like the standard Enumerable#count.
  def count(*args, &block)
    return super unless block && args.empty?

    inject(0) do |total, e|
      element = e.kind_of?(String) ? Element.new(e) : e
      block.call(element) ? total + 1 : total
    end
  end

  # Return the column elements concatenated into one string
  def to_s
    map { |e| e.to_s }.join('')
  end
end
98
+
99
+ end
100
+
101
+ end
102
+
@@ -1,10 +1,27 @@
1
+ require 'bio-alignment/edit/edit_columns'
1
2
 
2
3
  module Bio
3
4
  module BioAlignment
4
5
 
5
6
  module DelBridges
7
+ include MarkColumns
6
8
 
7
- def clean
9
+ # Return a new alignment with columns marked for deletion, i.e. mark
10
+ # columns that mostly contain gaps (threshold +percentage+). The
11
+ # alignment returned is a cloned copy
12
+ def mark_bridges percentage = 30
13
+ mark_columns { |state,column|
14
+ num = column.count { |e| e.gap? or e.undefined? }
15
+ if (num.to_f/column.length) > 1.0-percentage/100.0
16
+ state.delete!
17
+ end
18
+ state
19
+ }
20
+ end
21
+
22
+ # Return an alignment with the bridges removed
23
+ def del_bridges percentage=30
24
+ mark_bridges.columns_where { |col| !col.state.deleted? }
8
25
  end
9
26
  end
10
27
  end
@@ -0,0 +1,27 @@
1
+ require 'bio-alignment/edit/edit_rows'
2
+
3
+ module Bio
4
+ module BioAlignment
5
+
6
module DelNonInformativeSequences
  include MarkRows

  # Return a new alignment with rows marked for deletion, i.e. mark rows
  # that mostly contain undefined elements and gaps. A row is marked when
  # its fraction of such elements is larger than 1 - percentage/100 (with
  # the default of 30: more than 70%). The alignment returned is a
  # cloned copy.
  def mark_non_informative_sequences(percentage = 30)
    mark_rows do |state, row|
      num = row.count { |e| e.gap? || e.undefined? }
      state.delete! if num.to_f / row.length > 1.0 - percentage / 100.0
      state
    end
  end

  # Return an alignment without the rows that
  # mark_non_informative_sequences marks as deleted for this +percentage+
  def del_non_informative_sequences(percentage = 30)
    # pass the threshold on; it used to be dropped, so the default was
    # always applied
    mark_non_informative_sequences(percentage).rows_where { |row| !row.state.deleted? }
  end
end
26
+ end
27
+ end
@@ -0,0 +1,28 @@
1
+ require 'bio-alignment/edit/edit_rows'
2
+
3
+ module Bio
4
+ module BioAlignment
5
+
6
module DelShortSequences
  include MarkRows

  # Return a new alignment with rows marked for deletion, i.e. mark
  # rows that mostly contain gaps. A row is marked when its fraction of
  # gaps is larger than 1 - percentage/100 (with the default of 30: more
  # than 70%). The alignment returned is a cloned copy.
  def mark_short_sequences(percentage = 30)
    mark_rows do |state, row|
      num = row.count { |e| e.gap? }
      state.delete! if num.to_f / row.length > 1.0 - percentage / 100.0
      state
    end
  end

  # Return an alignment with the short sequences removed, i.e. without
  # the rows that mark_short_sequences marks as deleted for this
  # +percentage+
  def del_short_sequences(percentage = 30)
    # pass the threshold on; it used to be dropped, so the default was
    # always applied
    mark_short_sequences(percentage).rows_where { |row| !row.state.deleted? }
  end
end
27
+ end
28
+ end
@@ -0,0 +1,22 @@
1
+ module Bio
2
+ module BioAlignment
3
+
4
# Function for marking columns: the block decides on the new state of
# every column of a cloned alignment.
module MarkColumns
  # Clone the alignment and walk over the columns of the clone. The
  # block is called with a state object (a clone of the column's state,
  # or a new ColumnState when the column has none) and the column; the
  # value returned by the block becomes the column state. Returns the
  # cloned alignment.
  def mark_columns(&block)
    copy = clone
    copy.columns.each do |column|
      current = column.state
      candidate = current ? current.clone : ColumnState.new
      column.state = block.call(candidate, column)
    end
    copy
  end
end
20
+ end
21
+ end
22
+
@@ -0,0 +1,49 @@
1
+ module Bio
2
+ module BioAlignment
3
+
4
# Functions for marking rows (sequences): a block returns the new state
# (or the new sequence) for every row, and a newly cloned alignment is
# returned.
module MarkRows
  # Call the block with (state, row) for every row of a cloned
  # alignment and store the block's result as the row state.
  def mark_rows(&block)
    copy = markrows_clone
    copy.rows.each { |row| row.state = block.call(row.state, row) }
    copy
  end

  # Allow the marking of elements in a copied alignment. Every row is
  # handed to the block as row.to_elements, together with the row
  # number, and is replaced by the sequence the block returns, so a
  # Sequence alignment will be turned into an Elements alignment.
  def mark_row_elements(&block)
    copy = markrows_clone
    copy.rows.each_with_index do |row, index|
      copy.rows[index] = block.call(row.to_elements, index)
    end
    copy
  end

  protected

  # Clone the alignment and give every row its own state object: a
  # clone of the existing state, or a new RowState when there is none.
  def markrows_clone
    copy = clone
    copy.rows.each do |row|
      row.state = row.state ? row.state.clone : RowState.new
    end
    copy
  end
end
47
+ end
48
+ end
49
+
@@ -0,0 +1,115 @@
1
+ require 'bio-alignment/edit/edit_rows'
2
+
3
+ module Bio
4
+ module BioAlignment
5
+
6
module MaskIslands
  include MarkRows

  # Element state that, next to what ElementMaskedState tracks, records
  # whether the element is unique in its column
  class IslandElementState < ElementMaskedState
    attr_accessor :unique

    # Append 'U' for a unique element, a space otherwise
    def to_s
      super + (@unique ? 'U' : ' ')
    end
  end

  # Mask all 'islands' in a sequence with low consensus. An island is a
  # stretch of elements that is closed by a run of +min_gap_size+
  # (default 3) gaps; shorter runs of gaps (micro deletions) are allowed
  # inside an island. The start and the end of the row also delimit an
  # island.
  #
  # The island consensus is calculated by column: an element is unique
  # when it is the only non-gap element with that value in its column.
  # When less than 50% of the island shows consensus (i.e. more than half
  # of its elements are unique) all elements of the island are masked;
  # otherwise the island is retained.
  #
  # NOTE: earlier notes mention a maximum island size of 30; no maximum
  # is enforced by this code.
  #
  # Returns a copied alignment of Elements rows carrying
  # IslandElementState objects.
  def mark_islands(min_gap_size = 3)
    raise ArgumentError, 'min_gap_size must be at least 1' if min_gap_size < 1

    mark_row_elements do |row, _rownum|
      # first set state and find unique elements (i.e. no consensus)
      row.each_with_index do |e, colnum|
        e.state = IslandElementState.new
        column = columns[colnum]
        e.state.unique = (column.count { |e2| !e2.gap? && e2 == e } == 1)
      end
      # group elements into islands (split on gap runs) and mask
      island = []
      gap_run = 0
      row.each do |e|
        if e.gap?
          gap_run += 1
          # close the island once, at the moment the run gets long enough
          if gap_run == min_gap_size
            mark_island(island)
            island = []
          end
        else
          gap_run = 0
          island << e
        end
      end
      # the end of the row closes the last island
      mark_island(island)
      row # return changed sequence
    end
  end

  private

  # Mask every element of the island when less than half of its
  # elements show consensus. Islands of fewer than 2 elements are left
  # alone.
  def mark_island(island)
    return if island.length < 2

    unique = island.count { |e| e.state.unique == true }
    consensus = 1.0 - unique.to_f / island.length
    island.each { |e| e.state.mask! } if consensus < 0.5
  end

  # Debugging aid: print the elements of an island with their state
  def print_island(island)
    p island.map { |e2| e2.to_s + ':' + e2.state.to_s }.join(",")
  end
end
114
+ end
115
+ end
@@ -0,0 +1,44 @@
1
+ require 'bio-alignment/edit/edit_rows'
2
+
3
+ module Bio
4
+ module BioAlignment
5
+
6
module MaskSerialMutations
  include MarkRows

  # Edit a copied alignment and mask elements that are part of a
  # continuous series of unique mutations. An element is a unique
  # mutation when it is the only non-gap element with that value in its
  # column. Gaps are skipped and do not break a series. A series of
  # +min_serial+ (default 5) or fewer elements is unmasked again, so only
  # longer series stay masked.
  #
  # Returns a copied alignment of Elements rows carrying
  # ElementMaskedState objects.
  def mark_serial_mutations(min_serial = 5)
    mark_row_elements do |row, _rownum|
      # if an element is unique, mask it
      row.each_with_index do |e, colnum|
        e.state = ElementMaskedState.new
        column = columns[colnum]
        e.state.mask! if column.count { |e2| !e2.gap? && e2 == e } == 1
      end
      # now make sure the masked series are long enough, otherwise
      # start unmasking. First group all elements
      group = []
      row.each do |e|
        next if e.gap?

        if e.state.masked?
          group << e
        else
          unmask_short_series(group, min_serial)
          group = []
        end
      end
      # a series that runs up to the end of the row has no unmasked
      # element behind it, so it has to be checked here
      unmask_short_series(group, min_serial)
      row # return changed sequence
    end
  end

  private

  # Unmask all elements of the series when it is too short
  def unmask_short_series(group, min_serial)
    return if group.length > min_serial

    group.each { |e| e.state.unmask! }
  end
end
43
+ end
44
+ end
@@ -0,0 +1,86 @@
1
+
2
+ module Bio
3
+ module BioAlignment
4
+
5
# Simple element that can be queried. It wraps a single value +c+
# (normally a one character String) and can carry state (see State).
class Element
  GAP = '-'
  UNDEFINED = 'X'
  include State

  def initialize(c)
    @c = c
  end

  # True when the element is the gap symbol
  def gap?
    @c == GAP
  end

  # True when the element is the undefined symbol
  def undefined?
    @c == UNDEFINED
  end

  # Return the wrapped value
  def to_s
    @c
  end

  # Elements are equal to anything with the same to_s value, so an
  # Element can be compared with another Element or with a String
  def ==(other)
    to_s == other.to_s
  end
end
27
+
28
# Elements is a container for Element sequences. It holds an id and a
# list of Element objects, and can carry state (see State).
class Elements
  include Enumerable
  include State

  attr_reader :id, :seq

  # Create the container from +seq+, which can be another Elements
  # (its elements are copied), a String (one Element per character) or
  # anything that responds to +each+ (one Element per item).
  def initialize(id, seq)
    @id = id
    @seq = []
    if seq.kind_of?(Elements)
      # take the Array of copied elements, so @seq is always an Array
      # and not a nested Elements object
      @seq = seq.clone.seq
    elsif seq.kind_of?(String)
      seq.each_char { |c| @seq << Element.new(c) }
    else
      seq.each { |s| @seq << Element.new(s) }
    end
  end

  # Return the element at position +index+
  def [](index)
    @seq[index]
  end

  # Number of elements
  def length
    @seq.length
  end

  # Yield every element in order. Without a block an Enumerator is
  # returned.
  def each
    return enum_for(:each) unless block_given?

    @seq.each { |e| yield e }
  end

  # Return the elements concatenated into one string
  def to_s
    @seq.map { |e| e.to_s }.join("")
  end

  # Append an element
  def <<(element)
    @seq << element
  end

  # Return a new container with the same id and no elements
  def empty_copy
    Elements.new(@id, "")
  end

  # Return a new container with the same id, holding a dup of every
  # element
  def clone
    copy = Elements.new(@id, "")
    @seq.each { |e| copy << e.dup }
    copy
  end
end
84
+ end
85
+
86
+ end