bio-assembly 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,169 @@
1
+
2
+ require 'bio/sequence'
3
+ require 'bio-assembly/contig'
4
+ require 'bio-assembly/read'
5
+
6
module Bio

  # Reader/writer for sequence assemblies stored in an ACE file.
  #
  # The file given to the constructor is opened immediately, but contigs are
  # only parsed on demand (see #each_contig). Every parsed contig is cached in
  # @contigs so it can be replayed without reading the file again.
  class Assembly
    # Matches a line starting with a two-letter identifier (first letter one of
    # ABCDQRW, second one of ADFOQRST) followed by whitespace, e.g. "CO ",
    # "AF ", "RD " or a bare "BQ\n".
    IDENTIFIER_LINE = /^[ABCDQRW][ADFOQRST]\s/

    # Only the writer is generated; the reader below parses lazily.
    attr_writer :contigs

    # Opens the ACE file at +path+ and reads its first (AS) line.
    #
    # Raises ArgumentError if the file is empty.
    def initialize(path)
      @file = File.new(path, 'r')
      @contigs = []
      parse_as
    end

    # Returns all contigs, parsing whatever part of the file has not been
    # read yet. Use #each_contig to stream large files.
    def contigs
      parse_whole_file unless fully_parsed?
      @contigs
    end

    # Yields every contig of the assembly in file order.
    #
    # Contigs that were already parsed are replayed from the cache first; the
    # remaining ones are parsed from the file, cached and yielded as they are
    # read. This makes the method safe to call again after an earlier
    # iteration was left with +break+. Without a block an Enumerator is
    # returned.
    def each_contig
      return enum_for(:each_contig) unless block_given?

      @contigs.each { |contig| yield contig }
      return if fully_parsed?

      each_identifier do |identifier, attrs|
        next unless identifier == 'CO'
        contig = parse_contig(attrs)
        @contigs.push contig
        yield(contig)
      end

      # The loop above only finishes when the end of the file was reached.
      close
    end

    # Serialises the whole assembly: an "AS <contigs> <reads>" header line
    # followed by the ACE text of every contig.
    def to_ace
      ace = "AS " + num_contigs.to_s + " " + num_reads.to_s + "\n\n"
      each_contig { |contig| ace += contig.to_ace + "\n" }
      ace
    end

    # Closes the underlying file handle. Contigs parsed so far stay
    # available; contigs not yet read can no longer be parsed afterwards.
    def close
      @file.close unless @file.closed?
    end

    private

    # True when nothing more can or needs to be read from the file: either
    # the handle is closed or as many contigs are cached as the AS line
    # announced.
    def fully_parsed?
      @file.closed? || @total_num_contigs.to_i == @contigs.size
    end

    # Reads from the current file position up to and including the next blank
    # line. Returns "" at end of file.
    def read_block
      @file.gets("\n\n").to_s
    end

    # Builds a contig from the remainder of a CO line (+attrs+: name, number
    # of bases, number of reads, number of base segments, orientation) and
    # consumes the following lines until the announced number of RD records
    # has been read.
    #
    # NOTE(review): a contig announcing zero reads never satisfies the stop
    # condition and would keep consuming lines of the following contigs.
    def parse_contig(attrs)
      contig = Bio::Assembly::Contig.new
      contig.name, _base_num, @num_reads, _base_segments_num, contig.orientation = attrs.split(" ")
      # keep track of the number of RD identifiers parsed
      @num_rds_parsed = 0

      # consensus sequence: everything up to the next blank line
      contig.seq = read_block.tr(" \r\n", "")

      # loop through identifiers (e.g. AF, RD, etc.)
      each_identifier do |identifier, tag_attrs|
        case identifier
        when "BQ" then parse_bq(contig)
        when "AF" then parse_af(contig, tag_attrs)
        when "BS" then parse_bs(contig, tag_attrs)
        when "RD"
          parse_rd(contig, tag_attrs)
          break if @num_rds_parsed == @num_reads.to_i
        when "WR" then parse_wr(contig, tag_attrs)
        when "RT" then parse_rt(contig, tag_attrs)
        when "CT" then parse_ct(contig, tag_attrs)
        when "WA" then parse_wa(contig, tag_attrs)
        end
      end

      contig
    end

    # Reads the file line by line and yields (identifier, rest_of_line) for
    # every line that starts with a two-letter identifier; other lines are
    # skipped.
    def each_identifier
      @file.each do |line|
        next unless line =~ IDENTIFIER_LINE
        yield(line[0..1], line[3..-1])
      end
    end

    # parse assembly meta data (first line of the file)
    def parse_as
      line = @file.gets
      if line.nil?
        close
        raise ArgumentError, "ACE file is empty: no AS header found"
      end
      _identifier, @total_num_contigs, _total_num_reads = line.split(" ")
    end

    # parse contig sequence quality data
    #
    # Splitting on any whitespace keeps values apart even when the quality
    # lines do not start with a blank.
    def parse_bq(contig)
      contig.quality = read_block.split
    end

    # parse read meta data
    def parse_af(contig, attrs)
      read = Bio::Assembly::Read.new
      read.name, read.orientation, read.from = attrs.split(" ")
      contig.add_read read
    end

    # parse base sequence data
    def parse_bs(contig, attrs)
      from, to, read_name = attrs.split(" ")
      read = contig.find_read_by_name(read_name)
      # positions are handed over as integers so they compare numerically
      read.add_base_sequence(from.to_i, to.to_i, read_name)
    end

    # parse read sequence and position data
    def parse_rd(contig, attrs)
      # increment counter
      @num_rds_parsed += 1

      # parse read
      read_name, _num_padded_bases, _num_read_infos, _num_read_tags = attrs.split(" ")
      seq = read_block.tr(" \r\n", "")

      # get read with matching name
      read = contig.find_read_by_name(read_name)
      read.seq = seq
      read.to = read.from.to_i + read.seq.length
      # set read.to to contig length if read runs off contig
      read.to = contig.seq.length if read.to > contig.seq.length

      # parse QA and DS associated with this read; the DS line ends the record
      each_identifier do |identifier, tag_attrs|
        case identifier
        when "QA" then parse_qa(read, tag_attrs)
        when "DS"
          parse_ds(read, tag_attrs)
          break
        end
      end
    end

    # parse a read's clear ranges (the part of the read that contributes to the contig)
    def parse_qa(read, attrs)
      _start, _stop, clear_range_from, clear_range_to = attrs.split(" ")
      read.clear_range_from = clear_range_from
      read.clear_range_to = clear_range_to
    end

    # parse file data - ignored
    def parse_ds(read, attrs)
    end

    # parse run meta data - ignored
    def parse_wa(contig, attrs)
    end

    # parse run meta data - ignored
    def parse_ct(contig, attrs)
    end

    # WR lines - ignored (defined so the dispatch in parse_contig cannot fail)
    def parse_wr(contig, attrs)
    end

    # RT lines - ignored (defined so the dispatch in parse_contig cannot fail)
    def parse_rt(contig, attrs)
    end

    # number of contigs in the assembly (parses the whole file if needed)
    def num_contigs
      contigs.size
    end

    # total number of reads over all contigs
    def num_reads
      read_num = 0
      each_contig { |contig| read_num += contig.num_reads }
      read_num
    end

    # force parsing of everything that has not been read yet
    def parse_whole_file
      each_contig { |_contig| }
    end

  end

end
168
+
169
+
@@ -0,0 +1,97 @@
1
module Bio
  class Assembly

    # A contig of an assembly: a consensus sequence, optional per-base
    # quality values and the reads that were placed on it, indexed by name.
    class Contig
      attr_accessor :seq, :orientation, :quality, :to, :from, :name, :reads
      alias consensus_seq seq

      # str - the consensus sequence (wrapped in a Bio::Sequence::NA).
      def initialize(str="")
        @reads = {}
        @seq = Bio::Sequence::NA.new(str)
      end

      # Returns the read registered under +name+, or nil.
      def find_read_by_name(name)
        @reads[name]
      end

      # Returns the reads that start in, end in, or encompass the open
      # interval (clear_range_from, clear_range_to). All comparisons are
      # strict, so a read touching a boundary exactly is not reported.
      #
      # NOTE(review): the end coordinate is computed as read.to +
      # read.clear_range_to; read.from + read.clear_range_to looks more
      # plausible - confirm against real data before changing.
      def find_reads_in_range(clear_range_from, clear_range_to)
        @reads.values.select do |read|
          first = read.from + read.clear_range_from
          # Read starts in region
          next true if first > clear_range_from && first < clear_range_to

          last = read.to + read.clear_range_to
          # Read ends in region, or read encompasses region
          (last < clear_range_to && last > clear_range_from) ||
            (first < clear_range_from && last > clear_range_to)
        end
      end

      # Registers +read+ under its name, replacing any read of the same name.
      def add_read(read)
        # TODO do some checks for pos location
        @reads[read.name] = read
      end

      # Yields every read of the contig.
      def each_read
        @reads.each_value { |read| yield read }
      end

      # Number of reads on this contig.
      def num_reads
        @reads.size
      end

      # Length of the consensus sequence.
      def num_bases
        seq.length
      end

      # Total number of base segments over all reads; reads without base
      # sequences count as zero.
      def num_base_segments
        @reads.values.inject(0) do |total, read|
          total + (read.base_sequences.nil? ? 0 : read.base_sequences.size)
        end
      end

      # Serialises the contig as ACE text: the CO line, the consensus in
      # lines of at most 50 characters, the BQ block in lines of at most 50
      # values, then the AF lines, BS lines and RD/QA/DS records of the reads
      # (reads in sort order).
      def to_ace
        ace = ['CO', name, num_bases, num_reads, num_base_segments, orientation].join(' ') + "\n"
        ace += seq.to_s.gsub(/.{1,50}/, "\\0\n") + "\n"
        ace += "BQ\n"
        # each_slice writes nothing for an empty or missing quality list and
        # no trailing empty line when the count is a multiple of 50
        Array(quality).each_slice(50) do |chunk|
          ace += ' ' + chunk.join(' ') + "\n"
        end
        ace += "\n"

        # holds BS data for reads
        bs_str = ""
        # holds RD, QA, and DS data for reads
        rest_str = ""
        @reads.values.sort.each do |read|
          ace += read.to_ace_af
          bs_str += read.to_ace_bs
          rest_str += read.to_ace_rest
        end

        # compile data in correct order
        ace + bs_str + "\n" + rest_str
      end

    end

  end
end
@@ -0,0 +1,93 @@
1
+
2
+ require 'bio-assembly/read/ace'
3
+
4
module Bio
  class Assembly

    # A single read placed on a contig: its sequence, name, orientation,
    # position on the contig (from/to) and clear range. ACE specific
    # behaviour (base sequences) comes from the Ace mixin.
    class Read
      include Bio::Assembly::Read::Ace

      attr_accessor :seq, :name, :orientation, :from, :to, :clear_range_from, :clear_range_to

      # str - the read sequence (wrapped in a Bio::Sequence::NA).
      def initialize(str="")
        @seq = Bio::Sequence::NA.new(str)
      end

      # Two reads are equal when they carry the same name. Objects that do
      # not respond to #name (e.g. nil) are never equal to a read.
      def ==(other_read)
        other_read.respond_to?(:name) && name == other_read.name
      end

      # Length of the read sequence.
      def num_bases
        seq.length
      end

      # The position setters below coerce with to_i, so values taken straight
      # from a parsed line (strings) are stored as integers.
      def from=(new_from)
        @from = new_from.to_i
      end

      def to=(new_to)
        @to = new_to.to_i
      end

      def clear_range_from=(new_clear_range_from)
        @clear_range_from = new_clear_range_from.to_i
      end

      def clear_range_to=(new_clear_range_to)
        @clear_range_to = new_clear_range_to.to_i
      end

      # Serialises the read as ACE text: AF line, BS lines, a blank line and
      # then the RD/QA/DS record.
      def to_ace
        to_ace_af + to_ace_bs + "\n" + to_ace_rest
      end

      # Orders reads by start position and, for equal starts, by end
      # position. Raises a RuntimeError when +other+ is not a Read.
      def <=>(other)
        unless other.kind_of?(Bio::Assembly::Read)
          raise "[Error] markers are not comparable"
        end
        [from, to] <=> [other.from, other.to]
      end

      # One "BS <from> <to> <read_name>" line per base sequence; empty string
      # when the read has none.
      def to_ace_bs
        return "" if base_sequences.nil?

        base_sequences.map do |bs|
          ['BS', bs.from, bs.to, bs.read_name].join(' ') + "\n"
        end.join
      end

      # The "AF <name> <orientation> <from>" line.
      def to_ace_af
        ['AF', name, orientation, from].join(' ') + "\n"
      end

      # The RD record: RD line, the sequence in lines of at most 50
      # characters, a QA line (the clear range is written for both
      # coordinate pairs) and a DS line stamped with the current time.
      def to_ace_rest
        rest_str = ['RD', name, num_bases, 0, 0].join(' ') + "\n"
        rest_str += seq.to_s.gsub(/.{1,50}/, "\\0\n") + "\n"
        rest_str += ['QA', clear_range_from, clear_range_to, clear_range_from, clear_range_to].join(' ') + "\n"
        rest_str += ['DS', 'CHROMAT_FILE:', name, 'PHD_FILE:', "#{name}.phd.1", 'TIME:', Time.now].join(' ') + "\n"
        rest_str
      end

    end

  end
end
@@ -0,0 +1,39 @@
1
module Bio
  class Assembly
    class Read

      # ACE specific extension for reads: keeps the list of base sequences
      # (BS records) that belong to a read.
      module Ace
        attr_accessor :base_sequences

        # Appends a new BaseSequence built from the three values; the list is
        # created on first use.
        def add_base_sequence(from, to, read_name)
          @base_sequences ||= []
          @base_sequences.push BaseSequence.new(from, to, read_name)
        end

        # One base segment: the positions from..to and the name of the read
        # it refers to.
        class BaseSequence
          attr_accessor :read_name
          attr_reader :from, :to

          def initialize(from, to, read_name)
            self.from = from
            self.to = to
            @read_name = read_name
          end

          # Positions are stored as integers (same convention as the Read
          # setters), so values parsed from text compare numerically instead
          # of as strings.
          def from=(new_from)
            @from = new_from.to_i
          end

          def to=(new_to)
            @to = new_to.to_i
          end

          # Orders base sequences by start and, for equal starts, by end.
          # Raises a RuntimeError when +other+ is not a BaseSequence.
          def <=>(other)
            # Refer to the class lexically; it lives in Ace, so the constant
            # resolves even when no class has included the mixin yet.
            unless other.kind_of?(BaseSequence)
              raise "[Error] markers are not comparable"
            end
            [from, to] <=> [other.from, other.to]
          end

        end

      end
    end
  end
end
@@ -0,0 +1,18 @@
1
# Shared bootstrap for the test files: activates the bundled gems, loads the
# test libraries and puts lib/ and the test directory on the load path.
require 'rubygems'
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  # report the problem and leave with the status code Bundler suggests
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'test/unit'
require 'shoulda'

test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(test_dir, '..', 'lib'))
$LOAD_PATH.unshift(test_dir)
require 'bio-assembly'

# Reopened without additions.
class Test::Unit::TestCase
end
@@ -0,0 +1,78 @@
1
+ require 'helper'
2
+
3
# Parses data/example1.ace and checks the contigs and reads against known
# values; contig 5 and its read 235283518 are inspected in depth.
class TestBioAssembly < Test::Unit::TestCase

  def setup
    ace_filename = File.join('data', 'example1.ace')
    @obj = Bio::Assembly.new(ace_filename)

    # pick a contig to do in depth tests on
    @contig = nil
    @obj.each_contig { |c| @contig = c if c.name.to_i == 5 }

    # pick a read to do in depth tests on
    @read = nil
    @contig.each_read { |r| @read = r if r.name == '235283518' }
  end

  def test_num_contigs_parsed
    contigs_parsed = 13
    assert_equal(contigs_parsed, @obj.contigs.size)
  end

  def test_num_reads_parsed
    reads_parsed_known = 1760
    reads_parsed = 0
    @obj.contigs.each { |c| reads_parsed += c.reads.size }
    assert_equal(reads_parsed_known, reads_parsed)
  end

  def test_contig_num_reads
    num_reads = 15
    assert_equal(num_reads, @contig.reads.size)
  end

  def test_contig_seq
    seq = "TTTCCGTCAGATGTAAAGGTTGCAGAACCGGACCATTCTTGCGTCTGATCTTTCAGGATCGGATCGTTGGCGTCGAACTTATCGCTGTCTTTAAAGACACGGCCCGCGTTTTTCCAGCTGTCGATTGAGTTGTCGCCGACCTTTTGATAAAACATGTAGATTGATGTGTCATCAGCGTCTTTCGGGCTTCCCGCAAGAGCAAACACAACGTGATAGCCGTTGTATTCAGCTACTGTTCCGTCAGCGTTTTGCAGCGGCCAGCTGTCCCACACATCAAGTCCTTTTGCAGACTCAATATTTTTAATCGTTGATTGATCGAATTGAGGCACTTGGTATTTTTCGTTTTGCTGCTGTTTAGGGATCTGCAGCATATCATGGCGTGTAATATGAGAGACGCCGTACGTTTCTTTGTATGCTTTTTGGTTATTTTCTTTCGCGAAGGCTTGAGTCGCTCCTCCTGCCAGAAGTGCAGTCGTAAAAGTCAGAACTGTGGCTTGTTTTACAATTTTTTTGATGTTCATGTTCATGTCTCCTTCTGTATGTACTGTTTTTTGCGATCTGCCGTTTCGATCCTCCCGAATTGACTAGTGGGTAGGCCTGGCGGCCGCCTGGCCGTCGACATTTAGGTGACACTATAGAAGGATCCGCGGAATTCCTTTTTAGATTGAGATAATGACTTTGTTTGGAAGGATGTA*TTTTCATTTAATTAAAGCAAATTCGTAATAAT*AAAGTTAAACAATTTAATTTCAAGATGATTCACAGGTTTGTTGCCTCAAAAGAAAACTTATATTAATGGCAAGTTGTGAATAATTTATGCAACTCTTGTGGACAAGTTGACTCAACTTTTCAC*TTTATGTTATATTGTAAGGATGTGACTTTGTTTTGGAAAATTATATTTAATTTGATAATTAACCAATATAAAAAAGATAAACCAAAAGCTATAAGTCGTAAATAAGGACATTGGAAACAAGAAATATTCTCTCCTGAACATTATTTTAAATTATGCGCAATATGCAAATTTATAAGTGTTAAGTTAAAAAGATTGTTAATGGTTCTGTTTATTACCCAAAGACTTTTTTAAAGTTTAAGTCGTTGCTAAGAGTGCAGCGTTTAGACAAATAAAAATGCAATAATCTTCTCGCTCGGGAGCTATGTCCCTCGCATAATATTCTTCAAAGTGTACAGTAAATATTCTAGAAAAGTGAAGTGTGAAAAAGATATATTGCTTGTTTTTATATTTTGTTAATACAACAAAACTTCAAAAACCTGCGGTGGGGGGGGGGGGATAGTCACTTCCGTCACCTTCACCCCTCTCGTTCACTATACTCCCTCGCCCTGGCGTAATGATGGGGGGATTGGGGGTAGTTGCCCCTTAATAAAGTTCAAACTTGATTTATTTCTAACTCGATACCAGTGATTTACAAATGTTTCTGAAATGGCATGGTTTTCCCTAATAAATGCCTAAAAACCCTGAGCTGAGCCCACGCCAATT"
    assert_equal(seq, @contig.seq.to_s)
  end

  def test_read_seq
    read_seq = 'GAAAAAAAAAGGCAGAAGTTTAATCAAAACGGATTTTTCCGTCAGATGTAAAGGTTGCAGAACCGGACCATTCTTGCGTCTGATCTTTCAGGATCGGATCGTTGGCGTCGAACTTATCGCTGTCTTTAAAGACACGGCCCGCGTTTTTCCAGCTGTCGATTGAGTTGTCGCCGACCTTTTGATAAAACATGTAGATTGATGTGTCATCAGCGTCTTTCGGGCTTCCCGCAAGAGCAAACACAACGTGATAGCCGTTGTATTCAGCTACTGTTCCGTCAGCGTTTTGCAGCGGCCAGCTGTCCCACACATCAAGTCCTTTTGCAGACTCAATATTTTTAATCGTTGATTGATCGAATTGAGGCACTTGGTATTTTTCGTTTTGCTGCTGTTTAGGGATCTGCAGCATATCATGGCGTGTAATATGAGAGACGCCGTACGTTTCTTTGTATGCTTTTTGGTTATTTTCTTTCGCGAAGGCTTGAGTCGCTCCTCCTGCCAGAAGTGCAGTCGTAAAAGTCAGAACTGTGGCTTGTTTTACAATTTTTTTGATGTTCATGTTCATGTCTCCTTCTGTATGTACTGTTTTTTGCGATCTGCCGTTTCGATCCTCCCGAATTGACTAGTGGGTAGGCCTGGCGGCCGCCTGGCCGTCGACATTTAGGTGACACTATAGAAGGATCCGCGGAATTCCTTTTTAGATTGAGATAATGACTTTGTTTGGAAGGATGTATTTTTCATTTAATTAAAGCAAATTCGTAATAAT*AAAGTTAAACAATTT*ATTTC*AGATGATTCACAGGTTTGTTGCCTCAAAAG*AAACTTATATTAATGGCAAGTTGTGAATAATTTATGCAACTCTTGTGGGACAAGTTGACTTCACCT'
    assert_equal(read_seq, @read.seq.to_s)
  end

  def test_read_range
    from = -34
    to = 849
    assert_equal(to, @read.to)
    assert_equal(from, @read.from)
  end

  def test_read_clear_range
    clear_range_from = 36
    clear_range_to = 862
    assert_equal(clear_range_from, @read.clear_range_from)
    assert_equal(clear_range_to, @read.clear_range_to)
  end

  def test_read_orientation
    orientation = 'U'
    assert_equal(orientation, @read.orientation)
  end

  def test_find_reads_in_range
    known_reads = [ '235283518', '235288260', '235293813', '235288255', '235283548' ]
    reads_in_range = @contig.find_reads_in_range(295, 424)
    # Check the size of the RESULT (not of the literal list above); together
    # with the "nothing left over" check below this pins the exact set.
    assert_equal(known_reads.size, reads_in_range.size)
    known_reads.each do |read_name|
      # reads compare by name, so a bare probe read is enough for delete
      read = Bio::Assembly::Read.new()
      read.name = read_name
      reads_in_range.delete(read)
    end
    assert_equal(0, reads_in_range.size)
  end

end
77
+
78
+