bio-alignment 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. data/Gemfile +5 -4
  2. data/README.md +94 -9
  3. data/Rakefile +2 -1
  4. data/VERSION +1 -1
  5. data/doc/bio-alignment-design.md +75 -11
  6. data/features/bioruby-feature.rb +17 -0
  7. data/features/bioruby.feature +6 -1
  8. data/features/columns-feature.rb +2 -0
  9. data/features/edit/del_bridges-feature.rb +7 -3
  10. data/features/edit/del_bridges.feature +1 -2
  11. data/features/edit/del_non_informative_sequences-feature.rb +26 -0
  12. data/features/edit/del_non_informative_sequences.feature +19 -0
  13. data/features/edit/del_short_sequences-feature.rb +21 -0
  14. data/features/edit/del_short_sequences.feature +25 -0
  15. data/features/edit/gblocks-feature.rb +2 -2
  16. data/features/edit/mask_islands-feature.rb +17 -4
  17. data/features/edit/mask_islands.feature +28 -17
  18. data/features/edit/mask_serial_mutations-feature.rb +8 -6
  19. data/features/edit/mask_serial_mutations.feature +11 -11
  20. data/features/tree-feature.rb +66 -0
  21. data/features/tree.feature +45 -0
  22. data/lib/bio-alignment.rb +4 -1
  23. data/lib/bio-alignment/alignment.rb +58 -3
  24. data/lib/bio-alignment/codonsequence.rb +14 -2
  25. data/lib/bio-alignment/columns.rb +102 -0
  26. data/lib/bio-alignment/edit/del_bridges.rb +18 -1
  27. data/lib/bio-alignment/edit/del_non_informative_sequences.rb +27 -0
  28. data/lib/bio-alignment/edit/del_short_sequences.rb +28 -0
  29. data/lib/bio-alignment/edit/edit_columns.rb +22 -0
  30. data/lib/bio-alignment/edit/edit_rows.rb +49 -0
  31. data/lib/bio-alignment/edit/mask_islands.rb +115 -0
  32. data/lib/bio-alignment/edit/mask_serial_mutations.rb +44 -0
  33. data/lib/bio-alignment/elements.rb +86 -0
  34. data/lib/bio-alignment/rows.rb +52 -0
  35. data/lib/bio-alignment/sequence.rb +20 -14
  36. data/lib/bio-alignment/state.rb +64 -8
  37. data/lib/bio-alignment/tree.rb +77 -0
  38. data/spec/bio-alignment_spec.rb +57 -1
  39. data/spec/spec_helper.rb +3 -3
  40. metadata +47 -22
  41. data/lib/bio-alignment/column.rb +0 -47
@@ -0,0 +1,102 @@
1
+ require 'bio-alignment/state'
2
+
3
+ module Bio
4
+
5
+ module BioAlignment
6
+
7
# The Columns module provides column-wise access to an alignment. The
# including class must supply +rows+ (the list of sequences); the methods
# here build Column objects on top of it.
module Columns

  # Return the list of Column objects, one per alignment column. The list
  # is built on first use and cached in @columns; each Column reads its
  # elements from the alignment on demand.
  def columns
    @columns ||= (0...num_columns).map { |col| Column.new(self, col) }
  end

  # Number of columns, taken from the length of the first row. An alignment
  # without rows has zero columns.
  def num_columns
    first_row = rows.first
    first_row ? first_row.length : 0
  end

  # Return a new Alignment containing only the columns for which the block
  # returns true. The block receives the Column object. The originating
  # sequences should have methods 'empty_copy' and '<<'
  def columns_where &block
    cols = columns # look the cached list up once instead of per element
    seqs = rows.map do |seq|
      new_seq = seq.empty_copy
      seq.each_with_index do |e, i|
        new_seq << e if block.call(cols[i])
      end
      new_seq
    end
    Alignment.new(seqs)
  end

  # One character per column: the column state rendered with to_s, or '?'
  # when the column has no state attached.
  def columns_to_s
    columns.map { |c| (c.state ? c.state.to_s : '?') }.join
  end

  # Replace the cached column list with clones of its members, so column
  # state can be changed without touching the objects another alignment
  # holds. Does nothing when the column list has not been built yet.
  def clone_columns!
    return unless @columns
    @columns = @columns.map { |column| column.clone }
  end
end
52
+
53
# Support the notion of columns in an alignment. A Column is a lightweight
# view: it stores the alignment and a column index and reads elements from
# the alignment when asked. A column can have state by attaching state
# objects (presumably through the State mixin - confirm in state.rb).
class Column
  include State
  include Enumerable

  # +aln+ is the alignment the column belongs to, +col+ the column index.
  def initialize aln, col
    @aln = aln
    @col = col
  end

  # Element of this column in row number +index+
  def [] index
    @aln[index][@col]
  end

  # iterator fetches a column on demand, yielding column elements. Without
  # a block an Enumerator is returned.
  def each
    return to_enum(:each) unless block_given?
    @aln.each do | seq |
      yield seq[@col]
    end
  end

  # Number of rows in the alignment. The value is cached on first call, so
  # it does not follow later changes to the row list.
  def length
    @length ||= @aln.rows.size
  end

  # Count the elements for which the block returns true. Plain String
  # elements are wrapped in an Element first, so the block can use
  # predicates such as gap? and undefined?. Called without a block, or with
  # an argument, it behaves like Enumerable#count.
  def count *args, &block
    return super if block.nil? || !args.empty?
    counter = 0
    each do | e |
      counter += 1 if block.call(e.kind_of?(String) ? Element.new(e) : e)
    end
    counter
  end

  # The column rendered as a string, top row first
  def to_s
    map { |e| e.to_s }.join('')
  end

end
98
+
99
+ end
100
+
101
+ end
102
+
@@ -1,10 +1,27 @@
1
+ require 'bio-alignment/edit/edit_columns'
1
2
 
2
3
  module Bio
3
4
  module BioAlignment
4
5
 
5
6
  module DelBridges
7
+ include MarkColumns
6
8
 
7
- def clean
9
# Mark gap-dominated columns ("bridges") for deletion and return the result
# of mark_columns. A column is marked when the share of its elements that
# are gaps or undefined is strictly larger than 1 - percentage/100; with
# the default +percentage+ of 30 that means more than 70% of the column.
def mark_bridges percentage = 30
  cutoff = 1.0 - percentage / 100.0
  mark_columns do |state, column|
    missing = column.count { |e| e.gap? || e.undefined? }
    state.delete! if missing.to_f / column.length > cutoff
    state
  end
end
21
+
22
# Return an alignment with the bridges removed: columns are marked with
# mark_bridges using the given +percentage+ threshold, and only the columns
# that were not marked as deleted are kept.
def del_bridges percentage=30
  # forward the threshold; calling mark_bridges bare would always use 30
  mark_bridges(percentage).columns_where { |col| !col.state.deleted? }
end
9
26
  end
10
27
  end
@@ -0,0 +1,27 @@
1
+ require 'bio-alignment/edit/edit_rows'
2
+
3
+ module Bio
4
+ module BioAlignment
5
+
6
module DelNonInformativeSequences
  include MarkRows

  # Return a new alignment with rows marked for deletion, i.e. mark rows
  # that mostly contain undefined elements and gaps. A row is marked when
  # the share of such elements is strictly larger than 1 - percentage/100
  # (more than 70% with the default +percentage+ of 30). The alignment
  # returned is the copy produced by mark_rows.
  def mark_non_informative_sequences percentage = 30
    cutoff = 1.0 - percentage / 100.0
    mark_rows { |state, row|
      num = row.count { |e| e.gap? || e.undefined? }
      state.delete! if (num.to_f / row.length) > cutoff
      state
    }
  end

  # Return an alignment with the non-informative sequences removed, using
  # the same +percentage+ threshold as mark_non_informative_sequences.
  def del_non_informative_sequences percentage=30
    # forward the threshold; a bare call would always use the default of 30
    mark_non_informative_sequences(percentage).rows_where { |row| !row.state.deleted? }
  end
end
26
+ end
27
+ end
@@ -0,0 +1,28 @@
1
+ require 'bio-alignment/edit/edit_rows'
2
+
3
+ module Bio
4
+ module BioAlignment
5
+
6
module DelShortSequences
  include MarkRows

  # Return a new alignment with rows marked for deletion, i.e. mark
  # rows that mostly contain gaps. A row is marked when the share of gap
  # elements is strictly larger than 1 - percentage/100 (more than 70% with
  # the default +percentage+ of 30). The alignment returned is the copy
  # produced by mark_rows.
  def mark_short_sequences percentage = 30
    cutoff = 1.0 - percentage / 100.0
    mark_rows { |state, row|
      num = row.count { |e| e.gap? }
      state.delete! if (num.to_f / row.length) > cutoff
      state
    }
  end

  # Return an alignment with the short sequences removed, using the same
  # +percentage+ threshold as mark_short_sequences.
  def del_short_sequences percentage=30
    # forward the threshold; a bare call would always use the default of 30
    mark_short_sequences(percentage).rows_where { |row| !row.state.deleted? }
  end
end
27
+ end
28
+ end
@@ -0,0 +1,22 @@
1
+ module Bio
2
+ module BioAlignment
3
+
4
# Helper for marking columns: works on a clone of the alignment and lets a
# block decide the state of every column.
module MarkColumns
  # Clone the alignment and visit each of the clone's columns. The block is
  # called with (state, column), where state is a clone of the column's
  # current state or a fresh ColumnState when it has none; the block's
  # return value becomes the column's new state. Returns the clone.
  def mark_columns &block
    copy = self.clone
    copy.columns.each do |col|
      state = col.state ? col.state.clone : ColumnState.new
      col.state = block.call(state, col)
    end
    copy
  end
end
20
+ end
21
+ end
22
+
@@ -0,0 +1,49 @@
1
+ module Bio
2
+ module BioAlignment
3
+
4
# Helpers for marking rows (sequences). Every public method works on a
# clone of the alignment, hands each row to a block and returns the clone.
module MarkRows

  # Call the block with (state, row) for every row of the clone; the value
  # the block returns becomes the row's new state.
  def mark_rows &block
    copy = markrows_clone
    copy.rows.each { |row| row.state = block.call(row.state, row) }
    copy
  end

  # Call the block with (elements, row number) for every row of the clone,
  # where elements is row.to_elements, and store the block's return value
  # as the new row. This lets the block attach state to single elements; a
  # Sequence alignment is turned into an Elements alignment this way.
  def mark_row_elements &block
    copy = markrows_clone
    copy.rows.each_with_index do |row, index|
      copy.rows[index] = block.call(row.to_elements, index)
    end
    copy
  end

  protected

  # Clone the alignment and give every row of the clone its own state
  # object: a clone of the existing state, or a new RowState if none is set.
  def markrows_clone
    copy = self.clone
    copy.rows.each do |row|
      row.state = row.state ? row.state.clone : RowState.new
    end
    copy
  end

end
47
+ end
48
+ end
49
+
@@ -0,0 +1,115 @@
1
+ require 'bio-alignment/edit/edit_rows'
2
+
3
+ module Bio
4
+ module BioAlignment
5
+
6
module MaskIslands
  include MarkRows

  # Masked element state that also records whether the element is unique
  # in its column. to_s appends 'U' for unique elements, a space otherwise.
  class IslandElementState < ElementMaskedState
    attr_accessor :unique
    def to_s
      super + (@unique?'U':' ')
    end
  end

  # Mask all 'islands' in a sequence that show low consensus. Works on a
  # copy of the alignment (see mark_row_elements) and returns that copy.
  #
  # An island is a stretch of elements that is closed off by a run of at
  # least +min_gap_size+ consecutive gaps (default 3). Shorter gap runs,
  # i.e. micro deletions of 1 or 2 elements with the default, do not split
  # an island. The start of a row counts as being inside an island.
  #
  # Consensus is calculated by column: an element is 'unique' when it is
  # the only non-gap element in its column with that value. When more than
  # half of an island's elements are unique the whole island is masked;
  # otherwise it is retained. Islands of fewer than 2 elements are ignored.
  #
  # NOTE(review): earlier documentation also described an upper limit on
  # island length ('max_island_size', default 30); no such limit is
  # implemented here.
  def mark_islands min_gap_size = 3
    mark_row_elements { |row, rownum|
      # first set state and find unique elements (i.e. consensus)
      row.each_with_index do |e, colnum|
        e.state = IslandElementState.new
        column = columns[colnum]
        e.state.unique = (column.count { |e2| !e2.gap? && e2 == e } == 1)
      end
      # group elements into islands (split on gap) and mask
      gap_run = 0
      island = []
      in_island = true
      row.each do |e|
        if e.gap?
          # gaps between islands need no bookkeeping
          next unless in_island
          gap_run += 1
          if gap_run >= min_gap_size
            # the gap is wide enough to close the current island
            in_island = false
            mark_island(island)
            island = []
          end
        else
          island << e
          in_island = true
          gap_run = 0
        end
      end
      # the row ended inside an island: evaluate it as well
      mark_island(island) if in_island
      row # return changed sequence
    }
  end

  private

  # Mask every element of +island+ when its consensus, defined as
  # 1 - (unique elements / island length), is below 0.5. Islands with fewer
  # than 2 elements are left untouched.
  def mark_island island
    return if island.length < 2
    unique = island.count { |e| e.state.unique == true }
    consensus = 1.0 - unique.to_f / island.length
    island.each { |e| e.state.mask! } if consensus < 0.5
  end

  # Debugging aid: print each element of the island with its state
  def print_island island
    p island.map {|e2| e2.to_s + ':' + e2.state.to_s }.join(",")
  end
end
114
+ end
115
+ end
@@ -0,0 +1,44 @@
1
+ require 'bio-alignment/edit/edit_rows'
2
+
3
+ module Bio
4
+ module BioAlignment
5
+
6
module MaskSerialMutations
  include MarkRows

  # Edit a copied alignment (see mark_row_elements) and mask elements that
  # belong to a continuous run of unique mutations. An element is unique
  # when it is the only non-gap element in its column with that value.
  # Gaps are skipped and do not interrupt a run. Runs of +min_serial+
  # (default 5) or fewer unique elements are unmasked again, so only longer
  # runs stay masked. Returns the copied alignment.
  def mark_serial_mutations min_serial=5
    mark_row_elements { |row,rownum|
      # if an element is unique, mask it
      row.each_with_index do |e,colnum|
        e.state = ElementMaskedState.new
        column = columns[colnum]
        e.state.mask! if column.count { |e2| !e2.gap? && e2 == e } == 1
      end
      # now collect runs of masked elements and unmask the short ones
      run = []
      row.each do |e|
        next if e.gap?
        if e.state.masked?
          run << e
        else
          unmask_short_run(run, min_serial)
          run = []
        end
      end
      # a run that reaches the end of the row has no closing element, so
      # it has to be checked here or it would stay masked whatever its size
      unmask_short_run(run, min_serial)
      row # return changed sequence
    }
  end

  private

  # Unmask all elements of +run+ when it holds +min_serial+ or fewer
  def unmask_short_run run, min_serial
    return if run.length > min_serial
    run.each { |e| e.state.unmask! }
  end

end
43
+ end
44
+ end
@@ -0,0 +1,86 @@
1
+
2
+ module Bio
3
+ module BioAlignment
4
+
5
# Simple element that can be queried. Wraps one alignment symbol +c+ and
# answers whether it is a gap or an undefined symbol. State can be attached
# (presumably through the State mixin - confirm in state.rb).
class Element
  GAP = '-'
  UNDEFINED = 'X'
  include State

  def initialize c
    @c = c
  end

  # true when the wrapped symbol equals GAP
  def gap?
    @c == GAP
  end

  # true when the wrapped symbol equals UNDEFINED
  def undefined?
    @c == UNDEFINED
  end

  # Returns the wrapped symbol itself
  def to_s
    @c
  end

  # Elements compare by their string form, so an Element equals any object
  # whose to_s gives the same result
  def == other
    to_s == other.to_s
  end
end
27
+
28
# Elements is a container for Element sequences. It keeps a sequence id and
# an Array of Element objects, and offers the sequence interface used by
# the alignment code ([], length, each, <<, empty_copy, to_s).
class Elements
  include Enumerable
  include State

  attr_reader :id, :seq

  # +seq+ can be another Elements (its elements are copied), a String (one
  # Element per character) or any object responding to each (one Element
  # per item).
  def initialize id, seq
    @id = id
    @seq = []
    if seq.kind_of?(Elements)
      # take the Array out of the copy, so @seq is always an Array
      @seq = seq.clone.seq
    elsif seq.kind_of?(String)
      seq.each_char do |c|
        @seq << Element.new(c)
      end
    else
      seq.each do |s|
        @seq << Element.new(s)
      end
    end
  end

  def [] index
    @seq[index]
  end

  def length
    @seq.length
  end

  def each
    @seq.each { |e| yield e }
  end

  def to_s
    @seq.map{|e| e.to_s }.join("")
  end

  # Append +element+ as is; it is not wrapped in an Element
  def << element
    @seq << element
  end

  # New Elements with the same id and no elements
  def empty_copy
    Elements.new(@id,"")
  end

  # New Elements with the same id holding a dup of every element.
  # NOTE(review): only @id and the elements are copied; confirm whether
  # state attached to this Elements object should be carried over too.
  def clone
    copy = Elements.new(@id,"")
    @seq.each do |e|
      copy << e.dup
    end
    copy
  end

end
84
+ end
85
+
86
+ end