bio-assembly 0.0.0 → 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -1,6 +1,15 @@
1
1
  = bio-assembly
2
2
 
3
- Description goes here.
3
+ BioRuby plugin for parsing, writing, and manipulating assembly data
4
+
5
+ == Install
6
+
7
+ gem install bio-assembly
8
+
9
+ == Usage
10
+
11
+ Examples on my blog:
12
+ http://chasemiller4.blogspot.com/2010/10/bioruby-ace-parser-example.html
4
13
 
5
14
  == Contributing to bio-assembly
6
15
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.0
1
+ 0.0.1
data/bio-assembly.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{bio-assembly}
8
- s.version = "0.0.0"
8
+ s.version = "0.0.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Chase Miller"]
@@ -27,9 +27,9 @@ Gem::Specification.new do |s|
27
27
  "bio-assembly.gemspec",
28
28
  "data/example1.ace",
29
29
  "lib/bio-assembly.rb",
30
+ "lib/bio-assembly/ace.rb",
30
31
  "lib/bio-assembly/contig.rb",
31
32
  "lib/bio-assembly/read.rb",
32
- "lib/bio-assembly/read/ace.rb",
33
33
  "test/helper.rb",
34
34
  "test/test_bio-assembly.rb"
35
35
  ]
data/lib/bio-assembly.rb CHANGED
@@ -1,169 +1,55 @@
1
-
2
- require 'bio/sequence'
1
+ require 'bio/sequence'
3
2
  require 'bio-assembly/contig'
4
3
  require 'bio-assembly/read'
5
4
 
6
5
  module Bio
7
6
 
8
- class Assembly
9
- attr_accessor :contigs
10
-
11
- def initialize(path)
12
- @file = File.new(path, 'r')
13
- @contigs = Array.new
14
- parse_as
15
- end
16
-
17
- def contigs
18
- # use each_contig to stream large files
19
- parse_whole_file if @contigs.empty?
20
- @contigs
21
- end
22
-
23
- def each_contig
24
- # check if file is already parsed
25
- if @total_num_contigs.to_i == @contigs.size
26
- @contigs.each{ |contig| yield contig }
27
- else
28
- each_identifier do |identifier, attrs|
29
- next unless identifier == 'CO'
30
- contig = parse_contig(attrs)
31
- @contigs.push contig
32
- yield(contig)
33
- end
7
+ class Assembly
8
+ attr_accessor :contigs
9
+
10
+ @@formats = { }
11
+
12
+ def self.create(path, format)
13
+ streamer = @@formats[format]
14
+ if streamer
15
+ streamer.new(path)
16
+ else
17
+ raise "Format type '#{format}' is not supported"
18
+ end
34
19
  end
35
- end
36
-
37
- def to_ace
38
- ace = ""
39
- ace += "AS " + num_contigs.to_s + " " + num_reads.to_s + "\n\n"
40
- each_contig { |contig| ace += contig.to_ace + "\n" }
41
- ace
42
- end
43
-
44
- private
45
-
46
- def parse_contig(attrs)
47
- contig = Bio::Assembly::Contig.new
48
- contig.name, base_num, @num_reads, base_segments_num, contig.orientation = attrs.split(" ")
49
- # keep track of the number of RD identifiers parsed
50
- @num_rds_parsed = 0
51
20
 
52
- # get sequence
53
- seq = @file.gets("\n\n").tr(" \r\n", "")
54
- contig.seq = seq
55
-
56
- # loop through identifiers (e.g AF, RD, etc)
57
- each_identifier do |identifier, attrs|
58
- case identifier
59
- when "BQ" then parse_bq(contig)
60
- when "AF" then parse_af(contig, attrs)
61
- when "BS" then parse_bs(contig, attrs)
62
- when "RD" then parse_rd(contig, attrs); break if @num_rds_parsed == @num_reads.to_i
63
- when "WR" then parse_wr(contig, attrs)
64
- when "RT" then parse_rt(contig, attrs)
65
- when "CT" then parse_ct(contig, attrs)
66
- when "WA" then parse_wa(contig, attrs)
67
- end
21
+ def self.register_parser name
22
+ @@formats[name] = self
68
23
  end
69
-
70
- contig
71
- end
72
-
73
- # Finds the next_identifier
74
- def each_identifier
75
- @file.each do |line|
76
- next if line !~ /^[ABCDQRW][ADFOQRST][\s\n].*/
77
- yield(line[0..1], line[3..-1])
24
+
25
+ def contigs
26
+ # use each_contig to stream large files
27
+ parse_whole_file if @contigs.empty?
28
+ @contigs
78
29
  end
79
- end
80
-
81
- # parse assembly meta data
82
- def parse_as
83
- line = @file.gets
84
- identifier, @total_num_contigs, total_num_reads = line.split(" ")
85
- end
86
-
87
- # parse contig sequence quality data
88
- def parse_bq(contig)
89
- contig.quality = @file.gets("\n\n").tr("\r\n", "").gsub(/^\s/, "").split(' ')
90
- end
91
-
92
- # parse read meta data
93
- def parse_af(contig, attrs)
94
- read = Bio::Assembly::Read.new
95
- read.name , read.orientation, read.from = attrs.split(" ")
96
- contig.add_read read
97
- end
98
-
99
- # parse base sequence data
100
- def parse_bs(contig, attrs)
101
- from, to, read_name = attrs.split(" ")
102
- read = contig.find_read_by_name( read_name )
103
- read.add_base_sequence(from, to, read_name)
104
- end
105
-
106
- # parse read sequence and position data
107
- def parse_rd(contig, attrs)
108
- # increment counter
109
- @num_rds_parsed += 1
110
30
 
111
- # parse read
112
- read_name, num_padded_bases, num_read_infos, num_read_tags = attrs.split(" ")
113
- seq = @file.gets("\n\n").tr( " \r\n", "")
114
-
115
- # get read with matching name
116
- read = contig.find_read_by_name( read_name )
117
- read.seq = seq
118
- read.to = read.from.to_i + read.seq.length
119
- # set read.to to contig length if read runs off contig
120
- read.to = contig.seq.length if read.to > contig.seq.length
121
-
122
- # if present parse QA and DS associated with this read
123
- each_identifier do |identifier, attrs|
124
- case identifier
125
- when "QA" then parse_qa(read, attrs)
126
- when "DS" then parse_ds(read, attrs); break
127
- end
31
+ def each_contig
32
+ # implemented by each format subclass
33
+ end
34
+
35
+ private
36
+
37
+ def num_contigs
38
+ contigs.size
128
39
  end
129
-
130
- end
131
-
132
- # parse a read's clear ranges (the part of the read that contributes to the contig)
133
- def parse_qa(read, attrs)
134
- start, stop, clear_range_from, clear_range_to = attrs.split(" ")
135
- read.clear_range_from = clear_range_from
136
- read.clear_range_to = clear_range_to
137
- end
138
-
139
- # parse file data - ignored
140
- def parse_ds(read, attrs)
141
- end
142
-
143
- # parse run meta data - ignored
144
- def parse_wa(contig, attrs)
145
- end
146
-
147
- # parse run meta data - ignored
148
- def parse_ct(contig, attrs)
149
- end
150
-
151
- def num_contigs
152
- contigs.size
153
- end
154
-
155
- def num_reads
156
- read_num = 0
157
- each_contig { |contig| read_num += contig.num_reads }
158
- read_num
159
- end
160
-
161
- def parse_whole_file
162
- each_contig { |x| 1 }
163
- end
164
-
165
- end
166
40
 
167
- end
41
+ def num_reads
42
+ read_num = 0
43
+ each_contig { |contig| read_num += contig.num_reads }
44
+ read_num
45
+ end
168
46
 
47
+ def parse_whole_file
48
+ each_contig { |x| 1 }
49
+ end
50
+
51
+ end
52
+
53
+ end
169
54
 
55
+ require 'bio-assembly/ace'
@@ -0,0 +1,258 @@
1
+
2
+ module Bio
3
+ class Assembly
4
+
5
+ class Ace < Bio::Assembly
6
+
7
+ # register parser with superclass
8
+ register_parser :ace
9
+
10
+ def initialize(path)
11
+ @file = File.new(path, 'r')
12
+ @contigs = Array.new
13
+ parse_as
14
+ end
15
+
16
+ def each_contig
17
+ # check if file is already parsed
18
+ if @total_num_contigs.to_i == @contigs.size
19
+ @contigs.each{ |contig| yield contig }
20
+ else
21
+ each_identifier do |identifier, attrs|
22
+ next unless identifier == 'CO'
23
+ contig = parse_contig(attrs)
24
+ @contigs.push contig
25
+ yield(contig)
26
+ end
27
+ end
28
+ end
29
+
30
+ def to_ace
31
+ ace = ""
32
+ ace += "AS " + num_contigs.to_s + " " + num_reads.to_s + "\n\n"
33
+ each_contig { |contig| ace += contig.to_ace + "\n" }
34
+ ace
35
+ end
36
+
37
+ private
38
+ def parse_contig(attrs)
39
+ contig = Bio::Assembly::Contig.new
40
+ contig.name, base_num, @num_reads, base_segments_num, contig.orientation = attrs.split(" ")
41
+ # keep track of the number of RD identifiers parsed
42
+ @num_rds_parsed = 0
43
+
44
+ # get sequence
45
+ seq = @file.gets("\n\n").tr(" \r\n", "")
46
+ contig.seq = seq
47
+
48
+ # loop through identifiers (e.g AF, RD, etc)
49
+ each_identifier do |identifier, attrs|
50
+ case identifier
51
+ when "BQ" then parse_bq(contig)
52
+ when "AF" then parse_af(contig, attrs)
53
+ when "BS" then parse_bs(contig, attrs)
54
+ when "RD" then parse_rd(contig, attrs); break if @num_rds_parsed == @num_reads.to_i
55
+ when "WR" then parse_wr(contig, attrs)
56
+ when "RT" then parse_rt(contig, attrs)
57
+ when "CT" then parse_ct(contig, attrs)
58
+ when "WA" then parse_wa(contig, attrs)
59
+ end
60
+ end
61
+
62
+ contig
63
+ end
64
+
65
+ # Finds the next_identifier
66
+ def each_identifier
67
+ @file.each do |line|
68
+ next if line !~ /^[ABCDQRW][ADFOQRST][\s\n].*/
69
+ yield(line[0..1], line[3..-1])
70
+ end
71
+ end
72
+
73
+ # parse assembly meta data
74
+ def parse_as
75
+ line = @file.gets
76
+ identifier, @total_num_contigs, total_num_reads = line.split(" ")
77
+ end
78
+
79
+ # parse contig sequence quality data
80
+ def parse_bq(contig)
81
+ contig.quality = @file.gets("\n\n").tr("\r\n", "").gsub(/^\s/, "").split(' ')
82
+ end
83
+
84
+ # parse read meta data
85
+ def parse_af(contig, attrs)
86
+ read = Bio::Assembly::Read.new
87
+ read.name , read.orientation, read.from = attrs.split(" ")
88
+ contig.add_read read
89
+ end
90
+
91
+ # parse base sequence data
92
+ def parse_bs(contig, attrs)
93
+ from, to, read_name = attrs.split(" ")
94
+ read = contig.find_read_by_name( read_name )
95
+ read.add_base_sequence(from, to, read_name)
96
+ end
97
+
98
+ # parse read sequence and position data
99
+ def parse_rd(contig, attrs)
100
+ # increment counter
101
+ @num_rds_parsed += 1
102
+
103
+ # parse read
104
+ read_name, num_padded_bases, num_read_infos, num_read_tags = attrs.split(" ")
105
+ seq = @file.gets("\n\n").tr( " \r\n", "")
106
+
107
+ # get read with matching name
108
+ read = contig.find_read_by_name( read_name )
109
+ read.seq = seq
110
+ read.to = read.from.to_i + read.seq.length
111
+ # set read.to to contig length if read runs off contig
112
+ read.to = contig.seq.length if read.to > contig.seq.length
113
+
114
+ # if present parse QA and DS associated with this read
115
+ each_identifier do |identifier, attrs|
116
+ case identifier
117
+ when "QA" then parse_qa(read, attrs)
118
+ when "DS" then parse_ds(read, attrs); break
119
+ end
120
+ end
121
+
122
+ end
123
+
124
+ # parse a read's clear ranges (the part of the read that contributes to the contig)
125
+ def parse_qa(read, attrs)
126
+ start, stop, clear_range_from, clear_range_to = attrs.split(" ")
127
+ read.clear_range_from = clear_range_from
128
+ read.clear_range_to = clear_range_to
129
+ end
130
+
131
+ # parse file data - ignored
132
+ def parse_ds(read, attrs)
133
+ end
134
+
135
+ # parse run meta data - ignored
136
+ def parse_wa(contig, attrs)
137
+ end
138
+
139
+ # parse run meta data - ignored
140
+ def parse_ct(contig, attrs)
141
+ end
142
+
143
+ end # => end class Ace
144
+
145
+ # open contig class and write ace specific methods for contig objects
146
+ class Contig
147
+
148
+ def to_ace
149
+ ace = ""
150
+ ace += ['CO', name, num_bases, num_reads, num_base_segments, orientation].join(' ') + "\n"
151
+ ace += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n"
152
+ ace += "BQ\n"
153
+ last_stop = quality.size - 1
154
+ (quality.size/50+1).times do |i|
155
+ start = i * 50
156
+ stop = (i+1) * 50 - 1
157
+ stop = last_stop if stop > last_stop
158
+ ace += ' ' + quality[start..stop].join(' ') + "\n"
159
+ end
160
+ ace += "\n"
161
+
162
+ # holds BS data for reads
163
+ bs_str = ""
164
+ # holds RD, QA, and DS data for reads
165
+ rest_str = ""
166
+ @reads.values.sort.each do |read|
167
+ ace += read.to_ace_af
168
+ bs_str += read.to_ace_bs
169
+ rest_str += read.to_ace_rest
170
+ end
171
+
172
+ # compile data in correct order
173
+ ace += bs_str
174
+ ace += "\n"
175
+ ace += rest_str
176
+ ace
177
+ end
178
+
179
+ end # => end Contig class
180
+
181
+ # open Read class to add ace specific methods for read objects
182
+ class Read
183
+
184
+ attr_accessor :base_sequences
185
+
186
+ def to_ace
187
+ ace += ""
188
+ # holds BS data for reads
189
+ bs_str = ""
190
+ # holds RD, QA, and DS data for reads
191
+ rest_str = ""
192
+ ace += to_ace_af
193
+ bs_str += to_ace_bs
194
+ rest_str = to_ace_rest
195
+
196
+ # compile data in correct order
197
+ ace += bs_str
198
+ ace += "\n"
199
+ ace += rest_str
200
+ ace
201
+ end
202
+
203
+ def to_ace_bs
204
+ bs_str = ""
205
+ unless base_sequences.nil?
206
+ base_sequences.each do |bs|
207
+ bs_str += ['BS', bs.from, bs.to, bs.read_name].join(' ') + "\n"
208
+ end
209
+ end
210
+ bs_str
211
+ end
212
+
213
+ def to_ace_af
214
+ ['AF', name, orientation, from].join(' ') + "\n"
215
+ end
216
+
217
+ def to_ace_rest
218
+ rest_str = ""
219
+ rest_str += ['RD', name, num_bases, 0, 0].join(' ') + "\n"
220
+ rest_str += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n"
221
+ rest_str += ['QA', clear_range_from, clear_range_to, clear_range_from, clear_range_to].join(' ') + "\n"
222
+ rest_str += ['DS', 'CHROMAT_FILE:', name, 'PHD_FILE:', "#{name}.phd.1", 'TIME:', Time.now].join(' ') + "\n"
223
+ rest_str
224
+ end
225
+
226
+ def add_base_sequence(from, to, read_name)
227
+ @base_sequences = Array.new if @base_sequences.nil?
228
+ @base_sequences.push BaseSequence.new(from, to, read_name)
229
+ end
230
+
231
+ class BaseSequence
232
+ attr_accessor :from, :to, :read_name
233
+
234
+ def initialize(from, to, read_name)
235
+ @from = from
236
+ @to = to
237
+ @read_name = read_name
238
+ end
239
+
240
+ def <=>(other)
241
+ unless other.kind_of?(Bio::Assembly::Read::BaseSequence)
242
+ raise "[Error] markers are not comparable"
243
+ end
244
+ if self.from == other.from
245
+ # sort by to if froms are identical
246
+ return self.to.<=>(other.to)
247
+ else
248
+ return self.from.<=>(other.from)
249
+ end
250
+ end
251
+
252
+ end # => end BaseSequence Class
253
+
254
+ end # => end Read Class
255
+
256
+
257
+ end # => end class Assembly
258
+ end # => end module Bio
@@ -1,3 +1,5 @@
1
+ #require 'bio-assembly/contig/ace'
2
+
1
3
  module Bio
2
4
  class Assembly
3
5
 
@@ -59,39 +61,9 @@ module Bio
59
61
  end
60
62
  num_base_sequences
61
63
  end
62
-
63
- def to_ace
64
- ace = ""
65
- ace += ['CO', name, num_bases, num_reads, num_base_segments, orientation].join(' ') + "\n"
66
- ace += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n"
67
- ace += "BQ\n"
68
- last_stop = quality.size - 1
69
- (quality.size/50+1).times do |i|
70
- start = i * 50
71
- stop = (i+1) * 50 - 1
72
- stop = last_stop if stop > last_stop
73
- ace += ' ' + quality[start..stop].join(' ') + "\n"
74
- end
75
- ace += "\n"
76
-
77
- # holds BS data for reads
78
- bs_str = ""
79
- # holds RD, QA, and DS data for reads
80
- rest_str = ""
81
- @reads.values.sort.each do |read|
82
- ace += read.to_ace_af
83
- bs_str += read.to_ace_bs
84
- rest_str += read.to_ace_rest
85
- end
86
-
87
- # compile data in correct order
88
- ace += bs_str
89
- ace += "\n"
90
- ace += rest_str
91
- ace
92
- end
93
64
 
94
65
  end
95
66
 
96
67
  end
97
- end
68
+ end
69
+
@@ -1,10 +1,9 @@
1
1
 
2
- require 'bio-assembly/read/ace'
2
+ #require 'bio-assembly/read/ace'
3
3
 
4
4
  module Bio
5
5
  class Assembly
6
6
  class Read
7
- include Bio::Assembly::Read::Ace
8
7
 
9
8
  attr_accessor :seq, :name, :orientation, :from, :to, :clear_range_from, :clear_range_to
10
9
  def initialize(str="")
@@ -34,24 +33,7 @@ module Bio
34
33
  def clear_range_to=(new_clear_range_to)
35
34
  @clear_range_to = new_clear_range_to.to_i
36
35
  end
37
-
38
- def to_ace
39
- ace += ""
40
- # holds BS data for reads
41
- bs_str = ""
42
- # holds RD, QA, and DS data for reads
43
- rest_str = ""
44
- ace += to_ace_af
45
- bs_str += to_ace_bs
46
- rest_str = to_ace_rest
47
-
48
- # compile data in correct order
49
- ace += bs_str
50
- ace += "\n"
51
- ace += rest_str
52
- ace
53
- end
54
-
36
+
55
37
  def <=>(other)
56
38
  unless other.kind_of?(Bio::Assembly::Read)
57
39
  raise "[Error] markers are not comparable"
@@ -64,29 +46,6 @@ module Bio
64
46
  end
65
47
  end
66
48
 
67
- def to_ace_bs
68
- bs_str = ""
69
- unless base_sequences.nil?
70
- base_sequences.each do |bs|
71
- bs_str += ['BS', bs.from, bs.to, bs.read_name].join(' ') + "\n"
72
- end
73
- end
74
- bs_str
75
- end
76
-
77
- def to_ace_af
78
- ['AF', name, orientation, from].join(' ') + "\n"
79
- end
80
-
81
- def to_ace_rest
82
- rest_str = ""
83
- rest_str += ['RD', name, num_bases, 0, 0].join(' ') + "\n"
84
- rest_str += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n"
85
- rest_str += ['QA', clear_range_from, clear_range_to, clear_range_from, clear_range_to].join(' ') + "\n"
86
- rest_str += ['DS', 'CHROMAT_FILE:', name, 'PHD_FILE:', "#{name}.phd.1", 'TIME:', Time.now].join(' ') + "\n"
87
- rest_str
88
- end
89
-
90
49
  end
91
50
 
92
51
  end
@@ -4,7 +4,7 @@ class TestBioAssembly < Test::Unit::TestCase
4
4
 
5
5
  def setup
6
6
  ace_filename = File.join('data', 'example1.ace')
7
- @obj = Bio::Assembly.new(ace_filename)
7
+ @obj = Bio::Assembly.create(ace_filename, :ace)
8
8
 
9
9
  # pick a contig to do in depth tests on
10
10
  @contig = nil
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-assembly
3
3
  version: !ruby/object:Gem::Version
4
- hash: 31
4
+ hash: 29
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 0
10
- version: 0.0.0
9
+ - 1
10
+ version: 0.0.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Chase Miller
@@ -114,9 +114,9 @@ files:
114
114
  - bio-assembly.gemspec
115
115
  - data/example1.ace
116
116
  - lib/bio-assembly.rb
117
+ - lib/bio-assembly/ace.rb
117
118
  - lib/bio-assembly/contig.rb
118
119
  - lib/bio-assembly/read.rb
119
- - lib/bio-assembly/read/ace.rb
120
120
  - test/helper.rb
121
121
  - test/test_bio-assembly.rb
122
122
  has_rdoc: true
@@ -1,39 +0,0 @@
1
- module Bio
2
- class Assembly
3
- class Read
4
-
5
- module Ace
6
- attr_accessor :base_sequences
7
-
8
- def add_base_sequence(from, to, read_name)
9
- @base_sequences = Array.new if @base_sequences.nil?
10
- @base_sequences.push BaseSequence.new(from, to, read_name)
11
- end
12
-
13
- class BaseSequence
14
- attr_accessor :from, :to, :read_name
15
-
16
- def initialize(from, to, read_name)
17
- @from = from
18
- @to = to
19
- @read_name = read_name
20
- end
21
-
22
- def <=>(other)
23
- unless other.kind_of?(Bio::Assembly::Read::BaseSequence)
24
- raise "[Error] markers are not comparable"
25
- end
26
- if self.from == other.from
27
- # sort by to if froms are identical
28
- return self.to.<=>(other.to)
29
- else
30
- return self.from.<=>(other.from)
31
- end
32
- end
33
-
34
- end
35
-
36
- end
37
- end
38
- end
39
- end