bio-gff3 0.8.4 → 0.8.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/Gemfile +4 -5
  2. data/Gemfile.lock +4 -0
  3. data/Rakefile +12 -10
  4. data/VERSION +1 -1
  5. data/bin/gff3-fetch +104 -47
  6. data/bio-gff3.gemspec +40 -12
  7. data/lib/bio-gff3.rb +29 -1
  8. data/lib/bio/db/gff/{gffinmemory.rb → digest/gffinmemory.rb} +15 -2
  9. data/lib/bio/db/gff/{gffnocache.rb → digest/gffnocache.rb} +2 -0
  10. data/lib/bio/db/gff/{gffparser.rb → digest/gffparser.rb} +40 -49
  11. data/lib/bio/db/gff/{gfffasta.rb → file/gfffasta.rb} +1 -1
  12. data/lib/bio/db/gff/{gfffileiterator.rb → file/gfffileiterator.rb} +3 -1
  13. data/lib/bio/db/gff/{gffdb.rb → gff3.rb} +6 -11
  14. data/lib/bio/db/gff/gff3fasta.rb +25 -0
  15. data/lib/bio/db/gff/gff3parsefile.rb +33 -0
  16. data/lib/bio/db/gff/gff3parserec.rb +63 -0
  17. data/lib/bio/db/gff/gffcomponent.rb +94 -0
  18. data/lib/bio/db/gff/gffrecord.rb +71 -0
  19. data/lib/bio/db/gff/gffsection.rb +41 -0
  20. data/lib/bio/db/gff/gffsequence.rb +160 -0
  21. data/lib/bio/db/gff/gffvalidate.rb +90 -0
  22. data/lib/bio/output/gfflogger.rb +33 -0
  23. data/spec/gff3_assemble2_spec.rb +3 -3
  24. data/spec/gff3_assemble3_spec.rb +4 -4
  25. data/spec/gff3_assemble_spec.rb +3 -3
  26. data/spec/gffdb_spec.rb +15 -15
  27. data/spec/gffparserec.rb +44 -0
  28. data/test/data/gff/test-cds.gff3 +3 -7
  29. data/test/data/regression/test_ext_gff3.rtest +61 -0
  30. data/test/data/regression/test_gff3.rtest +65 -0
  31. data/test/data/regression/test_nocache_ext_gff3.rtest +56 -0
  32. data/test/data/regression/test_nocache_gff3.rtest +65 -0
  33. data/test/regressiontest.rb +52 -0
  34. data/test/test_bio-gff3.rb +34 -4
  35. metadata +103 -29
  36. data/lib/bio/db/gff/gffassemble.rb +0 -341
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 8
8
- - 4
9
- version: 0.8.4
8
+ - 5
9
+ version: 0.8.5
10
10
  platform: ruby
11
11
  authors:
12
12
  - Pjotr Prins
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-01-03 00:00:00 +01:00
17
+ date: 2011-01-13 00:00:00 +01:00
18
18
  default_executable: gff3-fetch
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -33,8 +33,23 @@ dependencies:
33
33
  prerelease: false
34
34
  version_requirements: *id001
35
35
  - !ruby/object:Gem::Dependency
36
- name: shoulda
36
+ name: bio-logger
37
37
  requirement: &id002 !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ">"
41
+ - !ruby/object:Gem::Version
42
+ segments:
43
+ - 0
44
+ - 5
45
+ - 0
46
+ version: 0.5.0
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: *id002
50
+ - !ruby/object:Gem::Dependency
51
+ name: shoulda
52
+ requirement: &id003 !ruby/object:Gem::Requirement
38
53
  none: false
39
54
  requirements:
40
55
  - - ">="
@@ -44,10 +59,10 @@ dependencies:
44
59
  version: "0"
45
60
  type: :development
46
61
  prerelease: false
47
- version_requirements: *id002
62
+ version_requirements: *id003
48
63
  - !ruby/object:Gem::Dependency
49
64
  name: bundler
50
- requirement: &id003 !ruby/object:Gem::Requirement
65
+ requirement: &id004 !ruby/object:Gem::Requirement
51
66
  none: false
52
67
  requirements:
53
68
  - - ~>
@@ -59,10 +74,10 @@ dependencies:
59
74
  version: 1.0.0
60
75
  type: :development
61
76
  prerelease: false
62
- version_requirements: *id003
77
+ version_requirements: *id004
63
78
  - !ruby/object:Gem::Dependency
64
79
  name: jeweler
65
- requirement: &id004 !ruby/object:Gem::Requirement
80
+ requirement: &id005 !ruby/object:Gem::Requirement
66
81
  none: false
67
82
  requirements:
68
83
  - - ~>
@@ -74,10 +89,10 @@ dependencies:
74
89
  version: 1.5.2
75
90
  type: :development
76
91
  prerelease: false
77
- version_requirements: *id004
92
+ version_requirements: *id005
78
93
  - !ruby/object:Gem::Dependency
79
94
  name: rcov
80
- requirement: &id005 !ruby/object:Gem::Requirement
95
+ requirement: &id006 !ruby/object:Gem::Requirement
81
96
  none: false
82
97
  requirements:
83
98
  - - ">="
@@ -87,37 +102,81 @@ dependencies:
87
102
  version: "0"
88
103
  type: :development
89
104
  prerelease: false
90
- version_requirements: *id005
105
+ version_requirements: *id006
106
+ - !ruby/object:Gem::Dependency
107
+ name: rspec
108
+ requirement: &id007 !ruby/object:Gem::Requirement
109
+ none: false
110
+ requirements:
111
+ - - ">="
112
+ - !ruby/object:Gem::Version
113
+ segments:
114
+ - 2
115
+ - 0
116
+ - 0
117
+ version: 2.0.0
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: *id007
91
121
  - !ruby/object:Gem::Dependency
92
122
  name: bio
93
- requirement: &id006 !ruby/object:Gem::Requirement
123
+ requirement: &id008 !ruby/object:Gem::Requirement
94
124
  none: false
95
125
  requirements:
96
126
  - - ">="
97
127
  - !ruby/object:Gem::Version
98
128
  segments:
99
129
  - 1
100
- - 3
130
+ - 4
101
131
  - 1
102
- version: 1.3.1
103
- type: :development
132
+ version: 1.4.1
133
+ type: :runtime
104
134
  prerelease: false
105
- version_requirements: *id006
135
+ version_requirements: *id008
106
136
  - !ruby/object:Gem::Dependency
107
- name: rspec
108
- requirement: &id007 !ruby/object:Gem::Requirement
137
+ name: log4r
138
+ requirement: &id009 !ruby/object:Gem::Requirement
139
+ none: false
140
+ requirements:
141
+ - - ">"
142
+ - !ruby/object:Gem::Version
143
+ segments:
144
+ - 1
145
+ - 1
146
+ - 6
147
+ version: 1.1.6
148
+ type: :runtime
149
+ prerelease: false
150
+ version_requirements: *id009
151
+ - !ruby/object:Gem::Dependency
152
+ name: bio-logger
153
+ requirement: &id010 !ruby/object:Gem::Requirement
109
154
  none: false
110
155
  requirements:
111
156
  - - ">="
112
157
  - !ruby/object:Gem::Version
113
158
  segments:
114
- - 2
115
159
  - 0
160
+ - 6
161
+ - 1
162
+ version: 0.6.1
163
+ type: :runtime
164
+ prerelease: false
165
+ version_requirements: *id010
166
+ - !ruby/object:Gem::Dependency
167
+ name: rspec
168
+ requirement: &id011 !ruby/object:Gem::Requirement
169
+ none: false
170
+ requirements:
171
+ - - ">"
172
+ - !ruby/object:Gem::Version
173
+ segments:
174
+ - 2
116
175
  - 0
117
- version: 2.0.0
176
+ version: "2.0"
118
177
  type: :development
119
178
  prerelease: false
120
- version_requirements: *id007
179
+ version_requirements: *id011
121
180
  description: |
122
181
  GFF3 (genome browser) information and digest mRNA and CDS sequences.
123
182
  Options for low memory use and caching of records.
@@ -141,20 +200,29 @@ files:
141
200
  - bin/gff3-fetch
142
201
  - bio-gff3.gemspec
143
202
  - lib/bio-gff3.rb
144
- - lib/bio/db/gff/gffassemble.rb
145
- - lib/bio/db/gff/gffdb.rb
146
- - lib/bio/db/gff/gfffasta.rb
147
- - lib/bio/db/gff/gfffileiterator.rb
148
- - lib/bio/db/gff/gffinmemory.rb
149
- - lib/bio/db/gff/gffnocache.rb
150
- - lib/bio/db/gff/gffparser.rb
203
+ - lib/bio/db/gff/digest/gffinmemory.rb
204
+ - lib/bio/db/gff/digest/gffnocache.rb
205
+ - lib/bio/db/gff/digest/gffparser.rb
206
+ - lib/bio/db/gff/file/gfffasta.rb
207
+ - lib/bio/db/gff/file/gfffileiterator.rb
208
+ - lib/bio/db/gff/gff3.rb
209
+ - lib/bio/db/gff/gff3fasta.rb
210
+ - lib/bio/db/gff/gff3parsefile.rb
211
+ - lib/bio/db/gff/gff3parserec.rb
212
+ - lib/bio/db/gff/gffcomponent.rb
213
+ - lib/bio/db/gff/gffrecord.rb
214
+ - lib/bio/db/gff/gffsection.rb
215
+ - lib/bio/db/gff/gffsequence.rb
216
+ - lib/bio/db/gff/gffvalidate.rb
151
217
  - lib/bio/output/gfffastawriter.rb
218
+ - lib/bio/output/gfflogger.rb
152
219
  - lib/bio/system/lruhash.rb
153
220
  - spec/gff3_assemble2_spec.rb
154
221
  - spec/gff3_assemble3_spec.rb
155
222
  - spec/gff3_assemble_spec.rb
156
223
  - spec/gff3_fileiterator_spec.rb
157
224
  - spec/gffdb_spec.rb
225
+ - spec/gffparserec.rb
158
226
  - test/data/gff/MhA1_Contig1133.fa
159
227
  - test/data/gff/MhA1_Contig1133.gff3
160
228
  - test/data/gff/MhA1_Contig125.fa
@@ -164,7 +232,12 @@ files:
164
232
  - test/data/gff/test-ext-fasta.fa
165
233
  - test/data/gff/test-ext-fasta.gff3
166
234
  - test/data/gff/test.gff3
235
+ - test/data/regression/test_ext_gff3.rtest
236
+ - test/data/regression/test_gff3.rtest
237
+ - test/data/regression/test_nocache_ext_gff3.rtest
238
+ - test/data/regression/test_nocache_gff3.rtest
167
239
  - test/helper.rb
240
+ - test/regressiontest.rb
168
241
  - test/test_bio-gff3.rb
169
242
  has_rdoc: true
170
243
  homepage: http://github.com/pjotrp/bioruby-gff3
@@ -180,7 +253,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
180
253
  requirements:
181
254
  - - ">="
182
255
  - !ruby/object:Gem::Version
183
- hash: 520620943
184
256
  segments:
185
257
  - 0
186
258
  version: "0"
@@ -205,5 +277,7 @@ test_files:
205
277
  - spec/gff3_assemble_spec.rb
206
278
  - spec/gff3_fileiterator_spec.rb
207
279
  - spec/gffdb_spec.rb
280
+ - spec/gffparserec.rb
208
281
  - test/helper.rb
282
+ - test/regressiontest.rb
209
283
  - test/test_bio-gff3.rb
@@ -1,341 +0,0 @@
1
- #
2
- # = bio/db/gff/gffassemble.rb - Assemble mRNA and CDS from GFF
3
- #
4
- # Copyright:: Copyright (C) 2010
5
- # Pjotr Prins <pjotr.prins@thebird.nl>
6
- # License:: The Ruby License
7
- #
8
- # Fetch information from a GFF file
9
-
10
- module Bio
11
- module GFFbrowser
12
-
13
- module Helpers
14
-
15
- module Error
16
- def info str, id=''
17
- $stderr.print "Info: "+str+" <#{id}>\n"
18
- end
19
-
20
- def warn str, id=''
21
- Kernel.warn "Warning: "+str+" <#{id}>"
22
- end
23
-
24
- def error str, id=''
25
- Kernel.warn "Error: "+str+" <#{id}>"
26
- exit(1) if $stop_on_error
27
- end
28
- end
29
-
30
- # Helper class for counting IDs
31
- class Counter < Hash
32
- def add id
33
- self[id] = 0 if self[id] == nil
34
- self[id] += 1
35
- end
36
- end
37
-
38
- # Helper class for storing linked records based on a shared ID
39
- class LinkedRecs < Hash
40
- include Error
41
- def add id, rec
42
- info "Adding #{rec.feature_type} <#{id}>"
43
- self[id] = [] if self[id] == nil
44
- self[id] << rec
45
- end
46
-
47
- # Validate all lists belong to the same container/component
48
- def validate_seqname
49
- each do | id, rec |
50
- seqname = rec.first.seqname
51
- rec.each do | section |
52
- raise "Non-matching seqname #{section.seqname} in #{seqname}" if section.seqname != seqname
53
- end
54
- end
55
- end
56
-
57
- # Validate all lists share the same parent (if available). First checks
58
- # for Parent attribute, next for mRNA attribute
59
- def validate_shared_parent
60
- each do | id, rec |
61
- parent = rec.first.get_attribute('Parent')
62
- if parent
63
- rec.each do | section |
64
- _parent = section.get_attribute('Parent')
65
- raise "Non-matching parent #{_parent} and #{parent} in #{id}" if _parent != parent
66
- end
67
- end
68
- parent = rec.first.get_attribute('mRNA')
69
- if parent
70
- rec.each do | section |
71
- _parent = section.get_attribute('mRNA')
72
- raise "Non-matching parent #{_parent} and #{parent} in #{id}" if _parent != parent
73
- end
74
- end
75
- end
76
- end
77
-
78
- # walk all (CDS) lists for every container/component and
79
- # validate they do not overlap
80
- def validate_nonoverlapping
81
- each do | id, rec |
82
- sections = Sections::sort(rec)
83
- sections.each_with_index do | check, i |
84
- neighbour = sections[i+1]
85
- if neighbour and check.intersection(neighbour)
86
- warn "Overlapping sections for ",id
87
- end
88
- end
89
- end
90
- end
91
- end
92
-
93
- class Section < Range
94
- attr_reader :rec
95
- def initialize rec
96
- super(rec.start,rec.end)
97
- @rec = rec
98
- end
99
- def intersection(other)
100
- raise ArgumentError, 'value must be a Range' unless other.kind_of?(Range)
101
- min, max = first, exclude_end? ? max : last
102
- other_min, other_max = other.first, other.exclude_end? ? other.max : other.last
103
- new_min = self === other_min ? other_min : other === min ? min : nil
104
- new_max = self === other_max ? other_max : other === max ? max : nil
105
- new_min && new_max ? new_min..new_max : nil
106
- end
107
- alias_method :&, :intersection
108
- def <=> other
109
- first <=> other.first
110
- end
111
- end
112
-
113
- module Sections
114
- # Return list of sorted Sections
115
- def Sections::sort rec
116
- sections = []
117
- rec.each do | section |
118
- sections.push Section.new(section)
119
- end
120
- sections.sort
121
- end
122
- end
123
-
124
- module Record
125
- include Error
126
- # Format a record ID by, first, getting the ID attribute. If that fails
127
- # the seqname is used with the start/stop positions.
128
- def Record::formatID rec
129
- id = rec.id if rec.id
130
- if !id
131
- if rec.seqname
132
- id = "#{rec.seqname} #{rec.start} #{rec.end}".strip
133
- else
134
- id = 'unknown'
135
- $stderr.print "Record with unknown ID"+rec.to_s
136
- end
137
- end
138
- id
139
- end
140
- end
141
-
142
- module Gff3Component
143
-
144
- include Error
145
-
146
- COMPONENT_TYPES = %w{
147
- gene SO:0000704 contig transcript Component region
148
- }
149
-
150
- # Walk the component list to find a matching component/container for a
151
- # record. First use the parent ID. If that is missing go by sequence
152
- # name.
153
- def find_component rec
154
- parent = rec.get_attribute('Parent')
155
- if @componentlist[parent]
156
- # nice, there is a match
157
- info "find_component: Matched parent", parent
158
- return @componentlist[parent]
159
- end
160
- search = rec.seqname
161
- if @componentlist[search]
162
- info "find_component: Matched seqname", search
163
- return @componentlist[search]
164
- end
165
- @componentlist.each do | componentid, component |
166
- # dissemble id
167
- (id, start, stop) = componentid.split(/ /)
168
- if id==search and rec.start >= start.to_i and rec.end <= stop.to_i
169
- info "find_component: Matched column 0 and location", componentid
170
- return component
171
- end
172
- end
173
- # Ah, painful. At this point the record has no matching container, probably
174
- # because it has no parent ID and the component has an ID. We have to go by
175
- # ID for every component individually
176
- @componentlist.each do | componentid, component |
177
- if component.seqname==search and rec.start >= component.start and rec.end <= component.end
178
- # p ["----",search,rec]
179
- # p component
180
- info "find_component: Matched (long search) column 0 and location", componentid
181
- return component
182
- end
183
- end
184
- warn "Could not find container/component for",Record::formatID(rec)
185
- end
186
- end
187
-
188
- module Gff3Features
189
-
190
- # Ignore the following features (case sensitive?)
191
- IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + %w{
192
- transposon Match similarity UTR
193
- TF_binding_site intronSO:0000188 polyA_sequence SO:0000610
194
- polyA_site SO:0000553
195
- five_prime_UTR SO:0000204 three_prime_UTR SO:0000205
196
- exon SO:0000147
197
- }
198
- end
199
-
200
- module Gff3Sequence
201
- # Patch a sequence together from a Sequence string and an array
202
- # of records. Note that rec positions are 1-based coordinates, relative
203
- # to the landmark given in column 1 - in this case the sequence as it
204
- # is passed in. The following options are available:
205
- #
206
- # :reverse : do reverse if reverse is indicated (default true)
207
- # :complement : do complement if reverse is indicated (default true)
208
- # :phase : do set CDS phase (default false, normally ignore)
209
- # :trim : make sure sequence is multiple of 3 nucleotide bps (default true)
210
- #
211
- # special options:
212
- #
213
- # :raw : raw sequence (all above false)
214
- # :codonize : codon sequence (reverse, complement, and trim are true)
215
- # :fix : fix errors (default false)
216
- #
217
- def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true, :fix=>false, :debug=>false }
218
- # default to nil, if not passed in
219
- do_debug = options[:debug]
220
- do_phase = options[:phase]
221
- do_fix = options[:fix]
222
- # default to true, if not passed in
223
- do_reverse = (options[:reverse] == false ? false : true)
224
- do_trim = (options[:trim] == false ? false : true)
225
- do_complement = (options[:complement] == false ? false : true)
226
- if options[:raw]
227
- do_phase = false
228
- do_reverse = false
229
- do_trim = false
230
- do_complement = false
231
- elsif options[:codonize]
232
- do_phase = false
233
- do_reverse = true
234
- do_trim = true
235
- do_complement = true
236
- end
237
- sectionlist = Sections::sort(reclist)
238
- rec0 = sectionlist.first.rec
239
- # we assume ORF is always read in the same direction
240
- orf_reverse = (rec0.strand == '-')
241
- orf_frame = startpos - 1
242
- orf_frameshift = orf_frame % 3
243
- sectionlist = sectionlist.reverse if orf_reverse
244
- if do_debug
245
- p "------------------"
246
- p options
247
- p [:reverse,do_reverse]
248
- p [:complement,do_complement]
249
- p [:trim,do_trim]
250
- p [:orf_reverse, orf_reverse, rec0.strand]
251
- end
252
-
253
- if sequence.kind_of?(Bio::FastaFormat)
254
- # BioRuby conversion
255
- sequence = sequence.seq
256
- end
257
- # Generate array of sequences
258
- seq = sectionlist.map { | section |
259
- rec = section.rec
260
- s = sequence[(section.begin-1)..(section.end-1)]
261
- if do_reverse and orf_reverse
262
- s = s.reverse
263
- end
264
- # Correct for phase. Unfortunately the use of phase is ambiguous.
265
- # Here we check whether rec.start is in line with orf_frame. If it
266
- # is, we correct for phase. Otherwise it is ignored.
267
- if do_phase and rec.phase
268
- phase = rec.phase.to_i
269
- # if ((rec.start-startpos) % 3 == 0)
270
- s = s[phase..-1]
271
- # end
272
- end
273
- s
274
- }
275
- # p seq
276
- seq = seq.join
277
- if do_complement and do_reverse and orf_reverse
278
- ntseq = Bio::Sequence::NA.new(seq)
279
- seq = ntseq.forward_complement.upcase
280
- end
281
- # This is the place to fix sequences (e.g. the Wormbase bug)
282
- if do_fix or @options[:fix] or @options[:fix_wormbase]
283
- if @options[:fix_wormbase] and rec0.id.index('gene1')==0
284
- # Wormbase gene1 only, so ignore rest
285
- else
286
- test_frame = 0
287
- ntseq = Bio::Sequence::NA.new(seq)
288
- aaseq = ntseq.translate
289
- if aaseq.count('*') > 1
290
- test_frame = 1
291
- seq = seq[1..-1]
292
- ntseq = Bio::Sequence::NA.new(seq)
293
- aaseq = ntseq.translate
294
- if aaseq.count('*') > 1
295
- test_frame = 2
296
- seq = seq[1..-1]
297
- ntseq = Bio::Sequence::NA.new(seq)
298
- aaseq = ntseq.translate
299
- raise 'Validation problem '+rec0.id if aaseq.count('*') > 1
300
- end
301
- end
302
- if test_frame > 0
303
- warn rec0.id,"Frame adjusted to #{test_frame} (fix)"
304
- end
305
- end
306
- end
307
- if do_trim
308
- reduce = seq.size % 3
309
- seq = seq[0..(seq.size-1-reduce)] if reduce != 0
310
- end
311
- if @options[:validate]
312
- ntseq = Bio::Sequence::NA.new(seq)
313
- aaseq = ntseq.translate
314
- raise 'Validate translation problem '+rec0.id+"\n"+seq if aaseq.count('*') > 1
315
- end
316
-
317
- retval = seq
318
- retval
319
- end
320
-
321
- # Patch a sequence together from a Sequence string and an array
322
- # of records and translate in the correct direction and frame. The options
323
- # are the same as for +assemble+, except :trim defaults to true.
324
- def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true }
325
- seq = assemble(sequence, startpos, reclist, options)
326
- ntseq = Bio::Sequence::NA.new(seq)
327
- ntseq.translate
328
- end
329
-
330
- # Create a description for output
331
- def description id, component, rec
332
- sections = Sections::sort(rec)
333
- id+' Sequence:'+component.seqname+"_#{component.start}:#{component.end} ("+
334
- sections.map { |s| "#{s.first}:#{s.last}" }.join(', ') +")"
335
- end
336
-
337
- end
338
- end # Helpers
339
-
340
- end
341
- end