bio-assembly 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -1,6 +1,15 @@
1
1
  = bio-assembly
2
2
 
3
- Description goes here.
3
+ BioRuby plugin for parsing, writing, and manipulating assembly data
4
+
5
+ == Install
6
+
7
+ gem install bio-assembly
8
+
9
+ == Usage
10
+
11
+ Examples on my blog:
12
+ http://chasemiller4.blogspot.com/2010/10/bioruby-ace-parser-example.html
4
13
 
5
14
  == Contributing to bio-assembly
6
15
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.0
1
+ 0.0.1
data/bio-assembly.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{bio-assembly}
8
- s.version = "0.0.0"
8
+ s.version = "0.0.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Chase Miller"]
@@ -27,9 +27,9 @@ Gem::Specification.new do |s|
27
27
  "bio-assembly.gemspec",
28
28
  "data/example1.ace",
29
29
  "lib/bio-assembly.rb",
30
+ "lib/bio-assembly/ace.rb",
30
31
  "lib/bio-assembly/contig.rb",
31
32
  "lib/bio-assembly/read.rb",
32
- "lib/bio-assembly/read/ace.rb",
33
33
  "test/helper.rb",
34
34
  "test/test_bio-assembly.rb"
35
35
  ]
data/lib/bio-assembly.rb CHANGED
@@ -1,169 +1,55 @@
1
-
2
- require 'bio/sequence'
1
+ require 'bio/sequence'
3
2
  require 'bio-assembly/contig'
4
3
  require 'bio-assembly/read'
5
4
 
6
5
  module Bio
7
6
 
8
- class Assembly
9
- attr_accessor :contigs
10
-
11
- def initialize(path)
12
- @file = File.new(path, 'r')
13
- @contigs = Array.new
14
- parse_as
15
- end
16
-
17
- def contigs
18
- # use each_contig to stream large files
19
- parse_whole_file if @contigs.empty?
20
- @contigs
21
- end
22
-
23
- def each_contig
24
- # check if file is already parsed
25
- if @total_num_contigs.to_i == @contigs.size
26
- @contigs.each{ |contig| yield contig }
27
- else
28
- each_identifier do |identifier, attrs|
29
- next unless identifier == 'CO'
30
- contig = parse_contig(attrs)
31
- @contigs.push contig
32
- yield(contig)
33
- end
7
+ class Assembly
8
+ attr_accessor :contigs
9
+
10
+ @@formats = { }
11
+
12
+ def self.create(path, format)
13
+ streamer = @@formats[format]
14
+ if streamer
15
+ streamer.new(path)
16
+ else
17
+ raise "Format type '#{format}' is not supported"
18
+ end
34
19
  end
35
- end
36
-
37
- def to_ace
38
- ace = ""
39
- ace += "AS " + num_contigs.to_s + " " + num_reads.to_s + "\n\n"
40
- each_contig { |contig| ace += contig.to_ace + "\n" }
41
- ace
42
- end
43
-
44
- private
45
-
46
- def parse_contig(attrs)
47
- contig = Bio::Assembly::Contig.new
48
- contig.name, base_num, @num_reads, base_segments_num, contig.orientation = attrs.split(" ")
49
- # keep track of the number of RD identifiers parsed
50
- @num_rds_parsed = 0
51
20
 
52
- # get sequence
53
- seq = @file.gets("\n\n").tr(" \r\n", "")
54
- contig.seq = seq
55
-
56
- # loop through identifiers (e.g AF, RD, etc)
57
- each_identifier do |identifier, attrs|
58
- case identifier
59
- when "BQ" then parse_bq(contig)
60
- when "AF" then parse_af(contig, attrs)
61
- when "BS" then parse_bs(contig, attrs)
62
- when "RD" then parse_rd(contig, attrs); break if @num_rds_parsed == @num_reads.to_i
63
- when "WR" then parse_wr(contig, attrs)
64
- when "RT" then parse_rt(contig, attrs)
65
- when "CT" then parse_ct(contig, attrs)
66
- when "WA" then parse_wa(contig, attrs)
67
- end
21
+ def self.register_parser name
22
+ @@formats[name] = self
68
23
  end
69
-
70
- contig
71
- end
72
-
73
- # Finds the next_identifier
74
- def each_identifier
75
- @file.each do |line|
76
- next if line !~ /^[ABCDQRW][ADFOQRST][\s\n].*/
77
- yield(line[0..1], line[3..-1])
24
+
25
+ def contigs
26
+ # use each_contig to stream large files
27
+ parse_whole_file if @contigs.empty?
28
+ @contigs
78
29
  end
79
- end
80
-
81
- # parse assembly meta data
82
- def parse_as
83
- line = @file.gets
84
- identifier, @total_num_contigs, total_num_reads = line.split(" ")
85
- end
86
-
87
- # parse contig sequence quality data
88
- def parse_bq(contig)
89
- contig.quality = @file.gets("\n\n").tr("\r\n", "").gsub(/^\s/, "").split(' ')
90
- end
91
-
92
- # parse read meta data
93
- def parse_af(contig, attrs)
94
- read = Bio::Assembly::Read.new
95
- read.name , read.orientation, read.from = attrs.split(" ")
96
- contig.add_read read
97
- end
98
-
99
- # parse base sequence data
100
- def parse_bs(contig, attrs)
101
- from, to, read_name = attrs.split(" ")
102
- read = contig.find_read_by_name( read_name )
103
- read.add_base_sequence(from, to, read_name)
104
- end
105
-
106
- # parse read sequence and position data
107
- def parse_rd(contig, attrs)
108
- # increment counter
109
- @num_rds_parsed += 1
110
30
 
111
- # parse read
112
- read_name, num_padded_bases, num_read_infos, num_read_tags = attrs.split(" ")
113
- seq = @file.gets("\n\n").tr( " \r\n", "")
114
-
115
- # get read with matching name
116
- read = contig.find_read_by_name( read_name )
117
- read.seq = seq
118
- read.to = read.from.to_i + read.seq.length
119
- # set read.to to contig length if read runs off contig
120
- read.to = contig.seq.length if read.to > contig.seq.length
121
-
122
- # if present parse QA and DS associated with this read
123
- each_identifier do |identifier, attrs|
124
- case identifier
125
- when "QA" then parse_qa(read, attrs)
126
- when "DS" then parse_ds(read, attrs); break
127
- end
31
+ def each_contig
32
+ # implemented by each format subclass
33
+ end
34
+
35
+ private
36
+
37
+ def num_contigs
38
+ contigs.size
128
39
  end
129
-
130
- end
131
-
132
- # parse a read's clear ranges (the part of the read that contributes to the contig)
133
- def parse_qa(read, attrs)
134
- start, stop, clear_range_from, clear_range_to = attrs.split(" ")
135
- read.clear_range_from = clear_range_from
136
- read.clear_range_to = clear_range_to
137
- end
138
-
139
- # parse file data - ignored
140
- def parse_ds(read, attrs)
141
- end
142
-
143
- # parse run meta data - ignored
144
- def parse_wa(contig, attrs)
145
- end
146
-
147
- # parse run meta data - ignored
148
- def parse_ct(contig, attrs)
149
- end
150
-
151
- def num_contigs
152
- contigs.size
153
- end
154
-
155
- def num_reads
156
- read_num = 0
157
- each_contig { |contig| read_num += contig.num_reads }
158
- read_num
159
- end
160
-
161
- def parse_whole_file
162
- each_contig { |x| 1 }
163
- end
164
-
165
- end
166
40
 
167
- end
41
+ def num_reads
42
+ read_num = 0
43
+ each_contig { |contig| read_num += contig.num_reads }
44
+ read_num
45
+ end
168
46
 
47
+ def parse_whole_file
48
+ each_contig { |x| 1 }
49
+ end
50
+
51
+ end
52
+
53
+ end
169
54
 
55
+ require 'bio-assembly/ace'
@@ -0,0 +1,258 @@
1
+
2
+ module Bio
3
+ class Assembly
4
+
5
+ class Ace < Bio::Assembly
6
+
7
+ # register parser with superclass
8
+ register_parser :ace
9
+
10
+ def initialize(path)
11
+ @file = File.new(path, 'r')
12
+ @contigs = Array.new
13
+ parse_as
14
+ end
15
+
16
+ def each_contig
17
+ # check if file is already parsed
18
+ if @total_num_contigs.to_i == @contigs.size
19
+ @contigs.each{ |contig| yield contig }
20
+ else
21
+ each_identifier do |identifier, attrs|
22
+ next unless identifier == 'CO'
23
+ contig = parse_contig(attrs)
24
+ @contigs.push contig
25
+ yield(contig)
26
+ end
27
+ end
28
+ end
29
+
30
+ def to_ace
31
+ ace = ""
32
+ ace += "AS " + num_contigs.to_s + " " + num_reads.to_s + "\n\n"
33
+ each_contig { |contig| ace += contig.to_ace + "\n" }
34
+ ace
35
+ end
36
+
37
+ private
38
+ def parse_contig(attrs)
39
+ contig = Bio::Assembly::Contig.new
40
+ contig.name, base_num, @num_reads, base_segments_num, contig.orientation = attrs.split(" ")
41
+ # keep track of the number of RD identifiers parsed
42
+ @num_rds_parsed = 0
43
+
44
+ # get sequence
45
+ seq = @file.gets("\n\n").tr(" \r\n", "")
46
+ contig.seq = seq
47
+
48
+ # loop through identifiers (e.g AF, RD, etc)
49
+ each_identifier do |identifier, attrs|
50
+ case identifier
51
+ when "BQ" then parse_bq(contig)
52
+ when "AF" then parse_af(contig, attrs)
53
+ when "BS" then parse_bs(contig, attrs)
54
+ when "RD" then parse_rd(contig, attrs); break if @num_rds_parsed == @num_reads.to_i
55
+ when "WR" then parse_wr(contig, attrs)
56
+ when "RT" then parse_rt(contig, attrs)
57
+ when "CT" then parse_ct(contig, attrs)
58
+ when "WA" then parse_wa(contig, attrs)
59
+ end
60
+ end
61
+
62
+ contig
63
+ end
64
+
65
+ # Finds the next_identifier
66
+ def each_identifier
67
+ @file.each do |line|
68
+ next if line !~ /^[ABCDQRW][ADFOQRST][\s\n].*/
69
+ yield(line[0..1], line[3..-1])
70
+ end
71
+ end
72
+
73
+ # parse assembly meta data
74
+ def parse_as
75
+ line = @file.gets
76
+ identifier, @total_num_contigs, total_num_reads = line.split(" ")
77
+ end
78
+
79
+ # parse contig sequence quality data
80
+ def parse_bq(contig)
81
+ contig.quality = @file.gets("\n\n").tr("\r\n", "").gsub(/^\s/, "").split(' ')
82
+ end
83
+
84
+ # parse read meta data
85
+ def parse_af(contig, attrs)
86
+ read = Bio::Assembly::Read.new
87
+ read.name , read.orientation, read.from = attrs.split(" ")
88
+ contig.add_read read
89
+ end
90
+
91
+ # parse base sequence data
92
+ def parse_bs(contig, attrs)
93
+ from, to, read_name = attrs.split(" ")
94
+ read = contig.find_read_by_name( read_name )
95
+ read.add_base_sequence(from, to, read_name)
96
+ end
97
+
98
+ # parse read sequence and position data
99
+ def parse_rd(contig, attrs)
100
+ # increment counter
101
+ @num_rds_parsed += 1
102
+
103
+ # parse read
104
+ read_name, num_padded_bases, num_read_infos, num_read_tags = attrs.split(" ")
105
+ seq = @file.gets("\n\n").tr( " \r\n", "")
106
+
107
+ # get read with matching name
108
+ read = contig.find_read_by_name( read_name )
109
+ read.seq = seq
110
+ read.to = read.from.to_i + read.seq.length
111
+ # set read.to to contig length if read runs off contig
112
+ read.to = contig.seq.length if read.to > contig.seq.length
113
+
114
+ # if present parse QA and DS associated with this read
115
+ each_identifier do |identifier, attrs|
116
+ case identifier
117
+ when "QA" then parse_qa(read, attrs)
118
+ when "DS" then parse_ds(read, attrs); break
119
+ end
120
+ end
121
+
122
+ end
123
+
124
+ # parse a read's clear ranges (the part of the read that contributes to the contig)
125
+ def parse_qa(read, attrs)
126
+ start, stop, clear_range_from, clear_range_to = attrs.split(" ")
127
+ read.clear_range_from = clear_range_from
128
+ read.clear_range_to = clear_range_to
129
+ end
130
+
131
+ # parse file data - ignored
132
+ def parse_ds(read, attrs)
133
+ end
134
+
135
+ # parse run meta data - ignored
136
+ def parse_wa(contig, attrs)
137
+ end
138
+
139
+ # parse run meta data - ignored
140
+ def parse_ct(contig, attrs)
141
+ end
142
+
143
+ end # => end class Ace
144
+
145
+ # open contig class and write ace specific methods for contig objects
146
+ class Contig
147
+
148
+ def to_ace
149
+ ace = ""
150
+ ace += ['CO', name, num_bases, num_reads, num_base_segments, orientation].join(' ') + "\n"
151
+ ace += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n"
152
+ ace += "BQ\n"
153
+ last_stop = quality.size - 1
154
+ (quality.size/50+1).times do |i|
155
+ start = i * 50
156
+ stop = (i+1) * 50 - 1
157
+ stop = last_stop if stop > last_stop
158
+ ace += ' ' + quality[start..stop].join(' ') + "\n"
159
+ end
160
+ ace += "\n"
161
+
162
+ # holds BS data for reads
163
+ bs_str = ""
164
+ # holds RD, QA, and DS data for reads
165
+ rest_str = ""
166
+ @reads.values.sort.each do |read|
167
+ ace += read.to_ace_af
168
+ bs_str += read.to_ace_bs
169
+ rest_str += read.to_ace_rest
170
+ end
171
+
172
+ # compile data in correct order
173
+ ace += bs_str
174
+ ace += "\n"
175
+ ace += rest_str
176
+ ace
177
+ end
178
+
179
+ end # => end Contig class
180
+
181
+ # open Read class to add ace specific methods for read objects
182
+ class Read
183
+
184
+ attr_accessor :base_sequences
185
+
186
+ def to_ace
187
+ ace += ""
188
+ # holds BS data for reads
189
+ bs_str = ""
190
+ # holds RD, QA, and DS data for reads
191
+ rest_str = ""
192
+ ace += to_ace_af
193
+ bs_str += to_ace_bs
194
+ rest_str = to_ace_rest
195
+
196
+ # compile data in correct order
197
+ ace += bs_str
198
+ ace += "\n"
199
+ ace += rest_str
200
+ ace
201
+ end
202
+
203
+ def to_ace_bs
204
+ bs_str = ""
205
+ unless base_sequences.nil?
206
+ base_sequences.each do |bs|
207
+ bs_str += ['BS', bs.from, bs.to, bs.read_name].join(' ') + "\n"
208
+ end
209
+ end
210
+ bs_str
211
+ end
212
+
213
+ def to_ace_af
214
+ ['AF', name, orientation, from].join(' ') + "\n"
215
+ end
216
+
217
+ def to_ace_rest
218
+ rest_str = ""
219
+ rest_str += ['RD', name, num_bases, 0, 0].join(' ') + "\n"
220
+ rest_str += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n"
221
+ rest_str += ['QA', clear_range_from, clear_range_to, clear_range_from, clear_range_to].join(' ') + "\n"
222
+ rest_str += ['DS', 'CHROMAT_FILE:', name, 'PHD_FILE:', "#{name}.phd.1", 'TIME:', Time.now].join(' ') + "\n"
223
+ rest_str
224
+ end
225
+
226
+ def add_base_sequence(from, to, read_name)
227
+ @base_sequences = Array.new if @base_sequences.nil?
228
+ @base_sequences.push BaseSequence.new(from, to, read_name)
229
+ end
230
+
231
+ class BaseSequence
232
+ attr_accessor :from, :to, :read_name
233
+
234
+ def initialize(from, to, read_name)
235
+ @from = from
236
+ @to = to
237
+ @read_name = read_name
238
+ end
239
+
240
+ def <=>(other)
241
+ unless other.kind_of?(Bio::Assembly::Read::BaseSequence)
242
+ raise "[Error] markers are not comparable"
243
+ end
244
+ if self.from == other.from
245
+ # sort by to if froms are identical
246
+ return self.to.<=>(other.to)
247
+ else
248
+ return self.from.<=>(other.from)
249
+ end
250
+ end
251
+
252
+ end # => end BaseSequence Class
253
+
254
+ end # => end Read Class
255
+
256
+
257
+ end # => end class Assembly
258
+ end # => end module Bio
@@ -1,3 +1,5 @@
1
+ #require 'bio-assembly/contig/ace'
2
+
1
3
  module Bio
2
4
  class Assembly
3
5
 
@@ -59,39 +61,9 @@ module Bio
59
61
  end
60
62
  num_base_sequences
61
63
  end
62
-
63
- def to_ace
64
- ace = ""
65
- ace += ['CO', name, num_bases, num_reads, num_base_segments, orientation].join(' ') + "\n"
66
- ace += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n"
67
- ace += "BQ\n"
68
- last_stop = quality.size - 1
69
- (quality.size/50+1).times do |i|
70
- start = i * 50
71
- stop = (i+1) * 50 - 1
72
- stop = last_stop if stop > last_stop
73
- ace += ' ' + quality[start..stop].join(' ') + "\n"
74
- end
75
- ace += "\n"
76
-
77
- # holds BS data for reads
78
- bs_str = ""
79
- # holds RD, QA, and DS data for reads
80
- rest_str = ""
81
- @reads.values.sort.each do |read|
82
- ace += read.to_ace_af
83
- bs_str += read.to_ace_bs
84
- rest_str += read.to_ace_rest
85
- end
86
-
87
- # compile data in correct order
88
- ace += bs_str
89
- ace += "\n"
90
- ace += rest_str
91
- ace
92
- end
93
64
 
94
65
  end
95
66
 
96
67
  end
97
- end
68
+ end
69
+
@@ -1,10 +1,9 @@
1
1
 
2
- require 'bio-assembly/read/ace'
2
+ #require 'bio-assembly/read/ace'
3
3
 
4
4
  module Bio
5
5
  class Assembly
6
6
  class Read
7
- include Bio::Assembly::Read::Ace
8
7
 
9
8
  attr_accessor :seq, :name, :orientation, :from, :to, :clear_range_from, :clear_range_to
10
9
  def initialize(str="")
@@ -34,24 +33,7 @@ module Bio
34
33
  def clear_range_to=(new_clear_range_to)
35
34
  @clear_range_to = new_clear_range_to.to_i
36
35
  end
37
-
38
- def to_ace
39
- ace += ""
40
- # holds BS data for reads
41
- bs_str = ""
42
- # holds RD, QA, and DS data for reads
43
- rest_str = ""
44
- ace += to_ace_af
45
- bs_str += to_ace_bs
46
- rest_str = to_ace_rest
47
-
48
- # compile data in correct order
49
- ace += bs_str
50
- ace += "\n"
51
- ace += rest_str
52
- ace
53
- end
54
-
36
+
55
37
  def <=>(other)
56
38
  unless other.kind_of?(Bio::Assembly::Read)
57
39
  raise "[Error] markers are not comparable"
@@ -64,29 +46,6 @@ module Bio
64
46
  end
65
47
  end
66
48
 
67
- def to_ace_bs
68
- bs_str = ""
69
- unless base_sequences.nil?
70
- base_sequences.each do |bs|
71
- bs_str += ['BS', bs.from, bs.to, bs.read_name].join(' ') + "\n"
72
- end
73
- end
74
- bs_str
75
- end
76
-
77
- def to_ace_af
78
- ['AF', name, orientation, from].join(' ') + "\n"
79
- end
80
-
81
- def to_ace_rest
82
- rest_str = ""
83
- rest_str += ['RD', name, num_bases, 0, 0].join(' ') + "\n"
84
- rest_str += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n"
85
- rest_str += ['QA', clear_range_from, clear_range_to, clear_range_from, clear_range_to].join(' ') + "\n"
86
- rest_str += ['DS', 'CHROMAT_FILE:', name, 'PHD_FILE:', "#{name}.phd.1", 'TIME:', Time.now].join(' ') + "\n"
87
- rest_str
88
- end
89
-
90
49
  end
91
50
 
92
51
  end
@@ -4,7 +4,7 @@ class TestBioAssembly < Test::Unit::TestCase
4
4
 
5
5
  def setup
6
6
  ace_filename = File.join('data', 'example1.ace')
7
- @obj = Bio::Assembly.new(ace_filename)
7
+ @obj = Bio::Assembly.create(ace_filename, :ace)
8
8
 
9
9
  # pick a contig to do in depth tests on
10
10
  @contig = nil
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-assembly
3
3
  version: !ruby/object:Gem::Version
4
- hash: 31
4
+ hash: 29
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 0
10
- version: 0.0.0
9
+ - 1
10
+ version: 0.0.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Chase Miller
@@ -114,9 +114,9 @@ files:
114
114
  - bio-assembly.gemspec
115
115
  - data/example1.ace
116
116
  - lib/bio-assembly.rb
117
+ - lib/bio-assembly/ace.rb
117
118
  - lib/bio-assembly/contig.rb
118
119
  - lib/bio-assembly/read.rb
119
- - lib/bio-assembly/read/ace.rb
120
120
  - test/helper.rb
121
121
  - test/test_bio-assembly.rb
122
122
  has_rdoc: true
@@ -1,39 +0,0 @@
1
- module Bio
2
- class Assembly
3
- class Read
4
-
5
- module Ace
6
- attr_accessor :base_sequences
7
-
8
- def add_base_sequence(from, to, read_name)
9
- @base_sequences = Array.new if @base_sequences.nil?
10
- @base_sequences.push BaseSequence.new(from, to, read_name)
11
- end
12
-
13
- class BaseSequence
14
- attr_accessor :from, :to, :read_name
15
-
16
- def initialize(from, to, read_name)
17
- @from = from
18
- @to = to
19
- @read_name = read_name
20
- end
21
-
22
- def <=>(other)
23
- unless other.kind_of?(Bio::Assembly::Read::BaseSequence)
24
- raise "[Error] markers are not comparable"
25
- end
26
- if self.from == other.from
27
- # sort by to if froms are identical
28
- return self.to.<=>(other.to)
29
- else
30
- return self.from.<=>(other.from)
31
- end
32
- end
33
-
34
- end
35
-
36
- end
37
- end
38
- end
39
- end