bio-assembly 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,169 @@
1
+
2
+ require 'bio/sequence'
3
+ require 'bio-assembly/contig'
4
+ require 'bio-assembly/read'
5
+
6
module Bio

  # Reads and writes sequence assemblies stored in the ACE file format.
  #
  # The AS header line is read when the object is created. Contigs are parsed
  # lazily from the open file and cached in @contigs as they are encountered.
  class Assembly
    # Only the writer is generated: the reader is hand-written below because
    # it has to trigger parsing on first use.
    attr_writer :contigs

    # Opens the ACE file at +path+ and reads its AS header.
    #
    # NOTE: the file handle stays open for the lifetime of the object because
    # contigs are parsed on demand from the current file position.
    def initialize(path)
      @file = File.new(path, 'r')
      @contigs = []
      parse_as
    end

    # Returns every contig of the assembly as an Array, parsing the file first
    # if nothing has been cached yet. Use each_contig to stream large files.
    def contigs
      parse_whole_file if @contigs.empty?
      @contigs
    end

    # Yields each contig of the assembly in file order.
    #
    # Contigs that were parsed by an earlier call are replayed from the cache
    # first; parsing then resumes at the current file position. This keeps a
    # second call complete even when the first one was abandoned with +break+.
    # Without a block an Enumerator is returned.
    def each_contig
      return enum_for(:each_contig) unless block_given?

      @contigs.each { |contig| yield contig }
      return if fully_parsed?

      each_identifier do |identifier, attrs|
        next unless identifier == 'CO'
        contig = parse_contig(attrs)
        @contigs.push contig
        yield(contig)
      end
    end

    # Serialises the whole assembly as an ACE formatted String: an AS header
    # followed by the ACE text of every contig.
    def to_ace
      ace = "AS " + num_contigs.to_s + " " + num_reads.to_s + "\n\n"
      each_contig { |contig| ace << contig.to_ace << "\n" }
      ace
    end

    private

    # True once as many contigs are cached as the AS header announced.
    def fully_parsed?
      @total_num_contigs.to_i == @contigs.size
    end

    # Reads from the current position up to and including the next blank
    # line. Returns "" at end of file instead of nil.
    def read_block
      @file.gets("\n\n").to_s
    end

    # Builds a contig from a CO line (+attrs+ is the text after "CO ") and the
    # records that follow it, stopping after the announced number of RD
    # records has been read.
    def parse_contig(attrs)
      contig = Bio::Assembly::Contig.new
      # CO fields: name, base count, read count, base segment count, orientation
      contig.name, _base_num, @num_reads, _base_segments_num, contig.orientation = attrs.split(" ")
      # keep track of the number of RD identifiers parsed
      @num_rds_parsed = 0

      # the consensus sequence follows the CO line and ends at a blank line
      contig.seq = read_block.tr(" \r\n", "")

      # loop through identifiers (e.g AF, RD, etc)
      each_identifier do |identifier, record_attrs|
        case identifier
        when "BQ" then parse_bq(contig)
        when "AF" then parse_af(contig, record_attrs)
        when "BS" then parse_bs(contig, record_attrs)
        when "RD"
          parse_rd(contig, record_attrs)
          break if @num_rds_parsed == @num_reads.to_i
        when "WR" then parse_wr(contig, record_attrs)
        when "RT" then parse_rt(contig, record_attrs)
        when "CT" then parse_ct(contig, record_attrs)
        when "WA" then parse_wa(contig, record_attrs)
        end
      end

      contig
    end

    # Scans forward through the file and yields (identifier, attributes) for
    # every line that starts with a two letter record identifier followed by
    # whitespace. +attributes+ is the rest of the line after the separator.
    def each_identifier
      @file.each do |line|
        next if line !~ /^[ABCDQRW][ADFOQRST][\s\n].*/
        yield(line[0..1], line[3..-1])
      end
    end

    # parse assembly meta data (the AS line); an empty file yields no counts
    def parse_as
      _identifier, @total_num_contigs, _total_num_reads = @file.gets.to_s.split(" ")
    end

    # parse contig sequence quality data
    #
    # Splitting on any whitespace keeps values separate even when a quality
    # line does not begin with a space.
    def parse_bq(contig)
      contig.quality = read_block.split
    end

    # parse read meta data
    def parse_af(contig, attrs)
      read = Bio::Assembly::Read.new
      read.name, read.orientation, read.from = attrs.split(" ")
      contig.add_read read
    end

    # parse base sequence data
    def parse_bs(contig, attrs)
      from, to, read_name = attrs.split(" ")
      read = contig.find_read_by_name(read_name)
      read.add_base_sequence(from, to, read_name)
    end

    # parse read sequence and position data
    def parse_rd(contig, attrs)
      # increment counter
      @num_rds_parsed += 1

      # only the read name is needed from the RD line
      read_name = attrs.split(" ").first
      seq = read_block.tr(" \r\n", "")

      # get read with matching name (registered earlier by its AF line)
      read = contig.find_read_by_name(read_name)
      read.seq = seq
      read.to = read.from.to_i + read.seq.length
      # set read.to to contig length if read runs off contig
      read.to = contig.seq.length if read.to > contig.seq.length

      # if present parse QA and DS associated with this read
      each_identifier do |identifier, record_attrs|
        case identifier
        when "QA" then parse_qa(read, record_attrs)
        when "DS"
          parse_ds(read, record_attrs)
          break
        end
      end
    end

    # parse a read's clear ranges (the part of the read that contributes to the contig)
    def parse_qa(read, attrs)
      _start, _stop, clear_range_from, clear_range_to = attrs.split(" ")
      read.clear_range_from = clear_range_from
      read.clear_range_to = clear_range_to
    end

    # parse file data - ignored
    def parse_ds(read, attrs)
    end

    # parse run meta data - ignored
    def parse_wa(contig, attrs)
    end

    # parse run meta data - ignored
    def parse_ct(contig, attrs)
    end

    # WR records - ignored (defined so the dispatch in parse_contig cannot fail)
    def parse_wr(contig, attrs)
    end

    # RT records - ignored (defined so the dispatch in parse_contig cannot fail)
    def parse_rt(contig, attrs)
    end

    # Number of contigs in the assembly (parses the file if needed).
    def num_contigs
      contigs.size
    end

    # Total number of reads over all contigs.
    def num_reads
      read_num = 0
      each_contig { |contig| read_num += contig.num_reads }
      read_num
    end

    # Forces every remaining contig to be parsed and cached.
    def parse_whole_file
      each_contig { |_contig| }
    end

  end

end
168
+
169
+
@@ -0,0 +1,97 @@
1
module Bio
  class Assembly

    # One contig of an assembly: a consensus sequence, its per-base quality
    # values and the reads placed on it (stored in a Hash keyed by read name).
    class Contig
      # Number of bases / quality values written per line by to_ace.
      ACE_LINE_WIDTH = 50

      attr_accessor :seq, :orientation, :quality, :to, :from, :name, :reads
      alias consensus_seq seq

      # Creates an empty contig whose consensus sequence is +str+.
      def initialize(str="")
        @reads = {}
        @seq = Bio::Sequence::NA.new(str)
      end

      # Returns the read registered under +name+, or nil.
      def find_read_by_name(name)
        @reads[name]
      end

      # Returns the reads whose clear range starts inside, ends inside, or
      # completely spans the open interval (clear_range_from, clear_range_to).
      #
      # NOTE(review): the end coordinate is computed as read.to +
      # read.clear_range_to, exactly as before; confirm that adding the clear
      # range to read.to (rather than read.from) is intended.
      def find_reads_in_range(clear_range_from, clear_range_to)
        @reads.values.select do |read|
          start = read.from + read.clear_range_from
          # Read starts in region
          next true if start > clear_range_from && start < clear_range_to

          stop = read.to + read.clear_range_to
          # Read ends in region, or read encompasses region
          (stop < clear_range_to && stop > clear_range_from) ||
            (start < clear_range_from && stop > clear_range_to)
        end
      end

      # Registers +read+ under its name, replacing any read with the same name.
      def add_read(read)
        # TODO do some checks for pos location
        @reads[read.name] = read
      end

      # Yields every read of the contig.
      def each_read
        @reads.each_value { |read| yield read }
      end

      # Number of reads placed on this contig.
      def num_reads
        @reads.size
      end

      # Length of the consensus sequence.
      def num_bases
        seq.length
      end

      # Total number of base segments (BS records) over all reads; reads
      # without any base segment count as zero.
      def num_base_segments
        total = 0
        each_read do |read|
          total += read.base_sequences.size unless read.base_sequences.nil?
        end
        total
      end

      # Serialises the contig as ACE text: the CO line, the wrapped consensus
      # sequence, the BQ block, then the AF lines, BS lines and RD/QA/DS
      # records of all reads in sorted order.
      def to_ace
        ace = ""
        ace << ['CO', name, num_bases, num_reads, num_base_segments, orientation].join(' ') << "\n"
        ace << seq.to_s.gsub(/.{1,50}/, "\\0\n") << "\n"
        ace << "BQ\n"

        scores = Array(quality)
        if scores.empty?
          # an empty quality list still gets one placeholder line, as before
          ace << " \n"
        else
          # each_slice never produces a trailing empty line, even when the
          # number of values is an exact multiple of the line width
          scores.each_slice(ACE_LINE_WIDTH) do |chunk|
            ace << ' ' << chunk.join(' ') << "\n"
          end
        end
        ace << "\n"

        # holds BS data for reads
        bs_str = ""
        # holds RD, QA, and DS data for reads
        rest_str = ""
        @reads.values.sort.each do |read|
          ace << read.to_ace_af
          bs_str << read.to_ace_bs
          rest_str << read.to_ace_rest
        end

        # compile data in correct order
        ace << bs_str
        ace << "\n"
        ace << rest_str
        ace
      end

    end

  end
end
@@ -0,0 +1,93 @@
1
+
2
+ require 'bio-assembly/read/ace'
3
+
4
module Bio
  class Assembly
    # A single read placed on a contig: its sequence, name, orientation,
    # position on the contig (from/to) and clear range. Positions and clear
    # range bounds are always stored as Integers.
    class Read
      include Bio::Assembly::Read::Ace

      attr_accessor :seq, :name, :orientation
      # writers for these are defined below because they coerce to Integer
      attr_reader :from, :to, :clear_range_from, :clear_range_to

      # Creates a read whose sequence is +str+.
      def initialize(str="")
        @seq = Bio::Sequence::NA.new(str)
      end

      # Two reads are equal when they carry the same name. Objects that have
      # no +name+ are never equal to a read.
      def ==(other_read)
        other_read.respond_to?(:name) && name == other_read.name
      end

      # Length of the read sequence.
      def num_bases
        seq.length
      end

      def from=(new_from)
        @from = new_from.to_i
      end

      def to=(new_to)
        @to = new_to.to_i
      end

      def clear_range_from=(new_clear_range_from)
        @clear_range_from = new_clear_range_from.to_i
      end

      def clear_range_to=(new_clear_range_to)
        @clear_range_to = new_clear_range_to.to_i
      end

      # Serialises this read alone as ACE text: the AF line, the BS lines, a
      # blank line, then the RD/QA/DS records.
      def to_ace
        # start from an empty String; appending to an unassigned local is nil + String
        ace = ""
        ace << to_ace_af
        ace << to_ace_bs
        ace << "\n"
        ace << to_ace_rest
        ace
      end

      # Orders reads by start position, then by end position.
      # Raises a RuntimeError when +other+ is not a Read.
      def <=>(other)
        unless other.kind_of?(Bio::Assembly::Read)
          raise "[Error] markers are not comparable"
        end
        # sort by to if froms are identical
        return to.<=>(other.to) if from == other.from
        from.<=>(other.from)
      end

      # One "BS from to read_name" line per base segment; "" when there are none.
      def to_ace_bs
        bs_str = ""
        Array(base_sequences).each do |bs|
          bs_str << ['BS', bs.from, bs.to, bs.read_name].join(' ') << "\n"
        end
        bs_str
      end

      # The "AF name orientation from" line.
      def to_ace_af
        ['AF', name, orientation, from].join(' ') + "\n"
      end

      # The RD record (sequence wrapped at 50 bases), the QA line and a DS
      # line stamped with the current time.
      def to_ace_rest
        rest_str = ""
        rest_str << ['RD', name, num_bases, 0, 0].join(' ') << "\n"
        rest_str << seq.to_s.gsub(/.{1,50}/, "\\0\n") << "\n"
        rest_str << ['QA', clear_range_from, clear_range_to, clear_range_from, clear_range_to].join(' ') << "\n"
        rest_str << ['DS', 'CHROMAT_FILE:', name, 'PHD_FILE:', "#{name}.phd.1", 'TIME:', Time.now].join(' ') << "\n"
        rest_str
      end

    end

  end
end
@@ -0,0 +1,39 @@
1
module Bio
  class Assembly
    class Read

      # Mixin that lets a read carry the base segment (BS) records of an ACE
      # file.
      module Ace
        # Array of BaseSequence objects, or nil while none has been added.
        attr_accessor :base_sequences

        # Appends a base segment covering +from+..+to+ for +read_name+ and
        # returns the updated list.
        def add_base_sequence(from, to, read_name)
          @base_sequences ||= []
          @base_sequences.push BaseSequence.new(from, to, read_name)
        end

        # One BS record. +from+ and +to+ are stored exactly as given.
        class BaseSequence
          attr_accessor :from, :to, :read_name

          def initialize(from, to, read_name)
            @from = from
            @to = to
            @read_name = read_name
          end

          # Orders segments by start, then by end. The bounds are compared
          # as integers, so string bounds such as "9" and "10" sort
          # numerically. Raises a RuntimeError when +other+ is not a
          # BaseSequence.
          def <=>(other)
            # BaseSequence is resolved lexically, so the check does not depend
            # on Ace having been mixed into Read
            unless other.kind_of?(BaseSequence)
              raise "[Error] markers are not comparable"
            end
            [from.to_i, to.to_i] <=> [other.from.to_i, other.to.to_i]
          end

        end

      end
    end
  end
end
@@ -0,0 +1,18 @@
1
require 'rubygems'
require 'bundler'

# Activate the gems of the :default and :development Bundler groups. When
# Bundler cannot do so, print its message plus a hint and exit with the
# status code Bundler supplies.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message, "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'test/unit'
require 'shoulda'

# Put this directory first on the load path, followed by the sibling lib
# directory, so that 'helper' and 'bio-assembly' can be required by name.
$LOAD_PATH.unshift(File.dirname(__FILE__), File.join(File.dirname(__FILE__), '..', 'lib'))
require 'bio-assembly'

# Reopened without additions; a place for helpers shared by all test cases.
class Test::Unit::TestCase
end
@@ -0,0 +1,78 @@
1
+ require 'helper'
2
+
3
# Tests for Bio::Assembly run against the example ACE file. Each test gets a
# freshly parsed assembly plus one contig and one read picked in setup.
class TestBioAssembly < Test::Unit::TestCase

  # Parses data/example1.ace (path is relative to the working directory) and
  # selects the contig named 5 and its read 235283518 for the detailed tests.
  def setup
    ace_filename = File.join('data', 'example1.ace')
    @obj = Bio::Assembly.new(ace_filename)

    # pick a contig to do in depth tests on
    @contig = nil
    @obj.each_contig { |c| @contig = c if c.name.to_i == 5 }

    # pick a read to do in depth tests on
    @read = nil
    @contig.each_read{ |r| @read = r if r.name == '235283518' }
  end

  def test_num_contigs_parsed
    contigs_parsed = 13
    assert_equal(contigs_parsed, @obj.contigs.size)
  end

  def test_num_reads_parsed
    reads_parsed_known = 1760
    reads_parsed = 0
    @obj.contigs.each { |c| reads_parsed += c.reads.size }
    assert_equal(reads_parsed_known, reads_parsed)
  end

  def test_contig_num_reads
    num_reads = 15
    assert_equal(num_reads, @contig.reads.size )
  end

  def test_contig_seq
    seq = "TTTCCGTCAGATGTAAAGGTTGCAGAACCGGACCATTCTTGCGTCTGATCTTTCAGGATCGGATCGTTGGCGTCGAACTTATCGCTGTCTTTAAAGACACGGCCCGCGTTTTTCCAGCTGTCGATTGAGTTGTCGCCGACCTTTTGATAAAACATGTAGATTGATGTGTCATCAGCGTCTTTCGGGCTTCCCGCAAGAGCAAACACAACGTGATAGCCGTTGTATTCAGCTACTGTTCCGTCAGCGTTTTGCAGCGGCCAGCTGTCCCACACATCAAGTCCTTTTGCAGACTCAATATTTTTAATCGTTGATTGATCGAATTGAGGCACTTGGTATTTTTCGTTTTGCTGCTGTTTAGGGATCTGCAGCATATCATGGCGTGTAATATGAGAGACGCCGTACGTTTCTTTGTATGCTTTTTGGTTATTTTCTTTCGCGAAGGCTTGAGTCGCTCCTCCTGCCAGAAGTGCAGTCGTAAAAGTCAGAACTGTGGCTTGTTTTACAATTTTTTTGATGTTCATGTTCATGTCTCCTTCTGTATGTACTGTTTTTTGCGATCTGCCGTTTCGATCCTCCCGAATTGACTAGTGGGTAGGCCTGGCGGCCGCCTGGCCGTCGACATTTAGGTGACACTATAGAAGGATCCGCGGAATTCCTTTTTAGATTGAGATAATGACTTTGTTTGGAAGGATGTA*TTTTCATTTAATTAAAGCAAATTCGTAATAAT*AAAGTTAAACAATTTAATTTCAAGATGATTCACAGGTTTGTTGCCTCAAAAGAAAACTTATATTAATGGCAAGTTGTGAATAATTTATGCAACTCTTGTGGACAAGTTGACTCAACTTTTCAC*TTTATGTTATATTGTAAGGATGTGACTTTGTTTTGGAAAATTATATTTAATTTGATAATTAACCAATATAAAAAAGATAAACCAAAAGCTATAAGTCGTAAATAAGGACATTGGAAACAAGAAATATTCTCTCCTGAACATTATTTTAAATTATGCGCAATATGCAAATTTATAAGTGTTAAGTTAAAAAGATTGTTAATGGTTCTGTTTATTACCCAAAGACTTTTTTAAAGTTTAAGTCGTTGCTAAGAGTGCAGCGTTTAGACAAATAAAAATGCAATAATCTTCTCGCTCGGGAGCTATGTCCCTCGCATAATATTCTTCAAAGTGTACAGTAAATATTCTAGAAAAGTGAAGTGTGAAAAAGATATATTGCTTGTTTTTATATTTTGTTAATACAACAAAACTTCAAAAACCTGCGGTGGGGGGGGGGGGATAGTCACTTCCGTCACCTTCACCCCTCTCGTTCACTATACTCCCTCGCCCTGGCGTAATGATGGGGGGATTGGGGGTAGTTGCCCCTTAATAAAGTTCAAACTTGATTTATTTCTAACTCGATACCAGTGATTTACAAATGTTTCTGAAATGGCATGGTTTTCCCTAATAAATGCCTAAAAACCCTGAGCTGAGCCCACGCCAATT"
    assert_equal(seq, @contig.seq.to_s)
  end

  def test_read_seq
    read_seq = 'GAAAAAAAAAGGCAGAAGTTTAATCAAAACGGATTTTTCCGTCAGATGTAAAGGTTGCAGAACCGGACCATTCTTGCGTCTGATCTTTCAGGATCGGATCGTTGGCGTCGAACTTATCGCTGTCTTTAAAGACACGGCCCGCGTTTTTCCAGCTGTCGATTGAGTTGTCGCCGACCTTTTGATAAAACATGTAGATTGATGTGTCATCAGCGTCTTTCGGGCTTCCCGCAAGAGCAAACACAACGTGATAGCCGTTGTATTCAGCTACTGTTCCGTCAGCGTTTTGCAGCGGCCAGCTGTCCCACACATCAAGTCCTTTTGCAGACTCAATATTTTTAATCGTTGATTGATCGAATTGAGGCACTTGGTATTTTTCGTTTTGCTGCTGTTTAGGGATCTGCAGCATATCATGGCGTGTAATATGAGAGACGCCGTACGTTTCTTTGTATGCTTTTTGGTTATTTTCTTTCGCGAAGGCTTGAGTCGCTCCTCCTGCCAGAAGTGCAGTCGTAAAAGTCAGAACTGTGGCTTGTTTTACAATTTTTTTGATGTTCATGTTCATGTCTCCTTCTGTATGTACTGTTTTTTGCGATCTGCCGTTTCGATCCTCCCGAATTGACTAGTGGGTAGGCCTGGCGGCCGCCTGGCCGTCGACATTTAGGTGACACTATAGAAGGATCCGCGGAATTCCTTTTTAGATTGAGATAATGACTTTGTTTGGAAGGATGTATTTTTCATTTAATTAAAGCAAATTCGTAATAAT*AAAGTTAAACAATTT*ATTTC*AGATGATTCACAGGTTTGTTGCCTCAAAAG*AAACTTATATTAATGGCAAGTTGTGAATAATTTATGCAACTCTTGTGGGACAAGTTGACTTCACCT'
    assert_equal(read_seq, @read.seq.to_s)
  end

  def test_read_range
    from = -34
    to = 849
    assert_equal(to, @read.to)
    assert_equal(from, @read.from)
  end

  def test_read_clear_range
    clear_range_from = 36
    clear_range_to = 862
    assert_equal(clear_range_from, @read.clear_range_from)
    assert_equal(clear_range_to, @read.clear_range_to)
  end

  def test_read_orientation
    orientation = 'U'
    assert_equal(orientation, @read.orientation)
  end

  # The reads found in the range must be exactly the known ones: the same
  # number of reads, and nothing left once every known read is removed.
  def test_find_reads_in_range
    known_reads = [ '235283518', '235288260', '235293813', '235288255', '235283548' ]
    reads_in_range = @contig.find_reads_in_range(295, 424)
    # compare against the result, not against the literal list itself
    assert_equal( known_reads.size, reads_in_range.size )
    known_reads.each do |read_name|
      # reads compare equal by name, so a bare read with that name is enough
      read = Bio::Assembly::Read.new()
      read.name = read_name
      reads_in_range.delete(read)
    end
    assert_equal(0, reads_in_range.size)
  end

end
77
+
78
+