bio-gff3 0.8.4 → 0.8.5
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -5
- data/Gemfile.lock +4 -0
- data/Rakefile +12 -10
- data/VERSION +1 -1
- data/bin/gff3-fetch +104 -47
- data/bio-gff3.gemspec +40 -12
- data/lib/bio-gff3.rb +29 -1
- data/lib/bio/db/gff/{gffinmemory.rb → digest/gffinmemory.rb} +15 -2
- data/lib/bio/db/gff/{gffnocache.rb → digest/gffnocache.rb} +2 -0
- data/lib/bio/db/gff/{gffparser.rb → digest/gffparser.rb} +40 -49
- data/lib/bio/db/gff/{gfffasta.rb → file/gfffasta.rb} +1 -1
- data/lib/bio/db/gff/{gfffileiterator.rb → file/gfffileiterator.rb} +3 -1
- data/lib/bio/db/gff/{gffdb.rb → gff3.rb} +6 -11
- data/lib/bio/db/gff/gff3fasta.rb +25 -0
- data/lib/bio/db/gff/gff3parsefile.rb +33 -0
- data/lib/bio/db/gff/gff3parserec.rb +63 -0
- data/lib/bio/db/gff/gffcomponent.rb +94 -0
- data/lib/bio/db/gff/gffrecord.rb +71 -0
- data/lib/bio/db/gff/gffsection.rb +41 -0
- data/lib/bio/db/gff/gffsequence.rb +160 -0
- data/lib/bio/db/gff/gffvalidate.rb +90 -0
- data/lib/bio/output/gfflogger.rb +33 -0
- data/spec/gff3_assemble2_spec.rb +3 -3
- data/spec/gff3_assemble3_spec.rb +4 -4
- data/spec/gff3_assemble_spec.rb +3 -3
- data/spec/gffdb_spec.rb +15 -15
- data/spec/gffparserec.rb +44 -0
- data/test/data/gff/test-cds.gff3 +3 -7
- data/test/data/regression/test_ext_gff3.rtest +61 -0
- data/test/data/regression/test_gff3.rtest +65 -0
- data/test/data/regression/test_nocache_ext_gff3.rtest +56 -0
- data/test/data/regression/test_nocache_gff3.rtest +65 -0
- data/test/regressiontest.rb +52 -0
- data/test/test_bio-gff3.rb +34 -4
- metadata +103 -29
- data/lib/bio/db/gff/gffassemble.rb +0 -341
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 8
|
8
|
-
-
|
9
|
-
version: 0.8.
|
8
|
+
- 5
|
9
|
+
version: 0.8.5
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Pjotr Prins
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-
|
17
|
+
date: 2011-01-13 00:00:00 +01:00
|
18
18
|
default_executable: gff3-fetch
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -33,8 +33,23 @@ dependencies:
|
|
33
33
|
prerelease: false
|
34
34
|
version_requirements: *id001
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
36
|
+
name: bio-logger
|
37
37
|
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ">"
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
segments:
|
43
|
+
- 0
|
44
|
+
- 5
|
45
|
+
- 0
|
46
|
+
version: 0.5.0
|
47
|
+
type: :runtime
|
48
|
+
prerelease: false
|
49
|
+
version_requirements: *id002
|
50
|
+
- !ruby/object:Gem::Dependency
|
51
|
+
name: shoulda
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
38
53
|
none: false
|
39
54
|
requirements:
|
40
55
|
- - ">="
|
@@ -44,10 +59,10 @@ dependencies:
|
|
44
59
|
version: "0"
|
45
60
|
type: :development
|
46
61
|
prerelease: false
|
47
|
-
version_requirements: *
|
62
|
+
version_requirements: *id003
|
48
63
|
- !ruby/object:Gem::Dependency
|
49
64
|
name: bundler
|
50
|
-
requirement: &
|
65
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
51
66
|
none: false
|
52
67
|
requirements:
|
53
68
|
- - ~>
|
@@ -59,10 +74,10 @@ dependencies:
|
|
59
74
|
version: 1.0.0
|
60
75
|
type: :development
|
61
76
|
prerelease: false
|
62
|
-
version_requirements: *
|
77
|
+
version_requirements: *id004
|
63
78
|
- !ruby/object:Gem::Dependency
|
64
79
|
name: jeweler
|
65
|
-
requirement: &
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
66
81
|
none: false
|
67
82
|
requirements:
|
68
83
|
- - ~>
|
@@ -74,10 +89,10 @@ dependencies:
|
|
74
89
|
version: 1.5.2
|
75
90
|
type: :development
|
76
91
|
prerelease: false
|
77
|
-
version_requirements: *
|
92
|
+
version_requirements: *id005
|
78
93
|
- !ruby/object:Gem::Dependency
|
79
94
|
name: rcov
|
80
|
-
requirement: &
|
95
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
81
96
|
none: false
|
82
97
|
requirements:
|
83
98
|
- - ">="
|
@@ -87,37 +102,81 @@ dependencies:
|
|
87
102
|
version: "0"
|
88
103
|
type: :development
|
89
104
|
prerelease: false
|
90
|
-
version_requirements: *
|
105
|
+
version_requirements: *id006
|
106
|
+
- !ruby/object:Gem::Dependency
|
107
|
+
name: rspec
|
108
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
110
|
+
requirements:
|
111
|
+
- - ">="
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
segments:
|
114
|
+
- 2
|
115
|
+
- 0
|
116
|
+
- 0
|
117
|
+
version: 2.0.0
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: *id007
|
91
121
|
- !ruby/object:Gem::Dependency
|
92
122
|
name: bio
|
93
|
-
requirement: &
|
123
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
94
124
|
none: false
|
95
125
|
requirements:
|
96
126
|
- - ">="
|
97
127
|
- !ruby/object:Gem::Version
|
98
128
|
segments:
|
99
129
|
- 1
|
100
|
-
-
|
130
|
+
- 4
|
101
131
|
- 1
|
102
|
-
version: 1.
|
103
|
-
type: :
|
132
|
+
version: 1.4.1
|
133
|
+
type: :runtime
|
104
134
|
prerelease: false
|
105
|
-
version_requirements: *
|
135
|
+
version_requirements: *id008
|
106
136
|
- !ruby/object:Gem::Dependency
|
107
|
-
name:
|
108
|
-
requirement: &
|
137
|
+
name: log4r
|
138
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
139
|
+
none: false
|
140
|
+
requirements:
|
141
|
+
- - ">"
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
segments:
|
144
|
+
- 1
|
145
|
+
- 1
|
146
|
+
- 6
|
147
|
+
version: 1.1.6
|
148
|
+
type: :runtime
|
149
|
+
prerelease: false
|
150
|
+
version_requirements: *id009
|
151
|
+
- !ruby/object:Gem::Dependency
|
152
|
+
name: bio-logger
|
153
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
109
154
|
none: false
|
110
155
|
requirements:
|
111
156
|
- - ">="
|
112
157
|
- !ruby/object:Gem::Version
|
113
158
|
segments:
|
114
|
-
- 2
|
115
159
|
- 0
|
160
|
+
- 6
|
161
|
+
- 1
|
162
|
+
version: 0.6.1
|
163
|
+
type: :runtime
|
164
|
+
prerelease: false
|
165
|
+
version_requirements: *id010
|
166
|
+
- !ruby/object:Gem::Dependency
|
167
|
+
name: rspec
|
168
|
+
requirement: &id011 !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ">"
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
segments:
|
174
|
+
- 2
|
116
175
|
- 0
|
117
|
-
version: 2.0
|
176
|
+
version: "2.0"
|
118
177
|
type: :development
|
119
178
|
prerelease: false
|
120
|
-
version_requirements: *
|
179
|
+
version_requirements: *id011
|
121
180
|
description: |
|
122
181
|
GFF3 (genome browser) information and digest mRNA and CDS sequences.
|
123
182
|
Options for low memory use and caching of records.
|
@@ -141,20 +200,29 @@ files:
|
|
141
200
|
- bin/gff3-fetch
|
142
201
|
- bio-gff3.gemspec
|
143
202
|
- lib/bio-gff3.rb
|
144
|
-
- lib/bio/db/gff/
|
145
|
-
- lib/bio/db/gff/
|
146
|
-
- lib/bio/db/gff/
|
147
|
-
- lib/bio/db/gff/
|
148
|
-
- lib/bio/db/gff/
|
149
|
-
- lib/bio/db/gff/
|
150
|
-
- lib/bio/db/gff/
|
203
|
+
- lib/bio/db/gff/digest/gffinmemory.rb
|
204
|
+
- lib/bio/db/gff/digest/gffnocache.rb
|
205
|
+
- lib/bio/db/gff/digest/gffparser.rb
|
206
|
+
- lib/bio/db/gff/file/gfffasta.rb
|
207
|
+
- lib/bio/db/gff/file/gfffileiterator.rb
|
208
|
+
- lib/bio/db/gff/gff3.rb
|
209
|
+
- lib/bio/db/gff/gff3fasta.rb
|
210
|
+
- lib/bio/db/gff/gff3parsefile.rb
|
211
|
+
- lib/bio/db/gff/gff3parserec.rb
|
212
|
+
- lib/bio/db/gff/gffcomponent.rb
|
213
|
+
- lib/bio/db/gff/gffrecord.rb
|
214
|
+
- lib/bio/db/gff/gffsection.rb
|
215
|
+
- lib/bio/db/gff/gffsequence.rb
|
216
|
+
- lib/bio/db/gff/gffvalidate.rb
|
151
217
|
- lib/bio/output/gfffastawriter.rb
|
218
|
+
- lib/bio/output/gfflogger.rb
|
152
219
|
- lib/bio/system/lruhash.rb
|
153
220
|
- spec/gff3_assemble2_spec.rb
|
154
221
|
- spec/gff3_assemble3_spec.rb
|
155
222
|
- spec/gff3_assemble_spec.rb
|
156
223
|
- spec/gff3_fileiterator_spec.rb
|
157
224
|
- spec/gffdb_spec.rb
|
225
|
+
- spec/gffparserec.rb
|
158
226
|
- test/data/gff/MhA1_Contig1133.fa
|
159
227
|
- test/data/gff/MhA1_Contig1133.gff3
|
160
228
|
- test/data/gff/MhA1_Contig125.fa
|
@@ -164,7 +232,12 @@ files:
|
|
164
232
|
- test/data/gff/test-ext-fasta.fa
|
165
233
|
- test/data/gff/test-ext-fasta.gff3
|
166
234
|
- test/data/gff/test.gff3
|
235
|
+
- test/data/regression/test_ext_gff3.rtest
|
236
|
+
- test/data/regression/test_gff3.rtest
|
237
|
+
- test/data/regression/test_nocache_ext_gff3.rtest
|
238
|
+
- test/data/regression/test_nocache_gff3.rtest
|
167
239
|
- test/helper.rb
|
240
|
+
- test/regressiontest.rb
|
168
241
|
- test/test_bio-gff3.rb
|
169
242
|
has_rdoc: true
|
170
243
|
homepage: http://github.com/pjotrp/bioruby-gff3
|
@@ -180,7 +253,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
180
253
|
requirements:
|
181
254
|
- - ">="
|
182
255
|
- !ruby/object:Gem::Version
|
183
|
-
hash: 520620943
|
184
256
|
segments:
|
185
257
|
- 0
|
186
258
|
version: "0"
|
@@ -205,5 +277,7 @@ test_files:
|
|
205
277
|
- spec/gff3_assemble_spec.rb
|
206
278
|
- spec/gff3_fileiterator_spec.rb
|
207
279
|
- spec/gffdb_spec.rb
|
280
|
+
- spec/gffparserec.rb
|
208
281
|
- test/helper.rb
|
282
|
+
- test/regressiontest.rb
|
209
283
|
- test/test_bio-gff3.rb
|
@@ -1,341 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# = bio/db/gff/gffassemble.rb - Assemble mRNA and CDS from GFF
|
3
|
-
#
|
4
|
-
# Copyright:: Copyright (C) 2010
|
5
|
-
# Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
-
# License:: The Ruby License
|
7
|
-
#
|
8
|
-
# Fetch information from a GFF file
|
9
|
-
|
10
|
-
module Bio
|
11
|
-
module GFFbrowser
|
12
|
-
|
13
|
-
module Helpers
|
14
|
-
|
15
|
-
module Error
|
16
|
-
def info str, id=''
|
17
|
-
$stderr.print "Info: "+str+" <#{id}>\n"
|
18
|
-
end
|
19
|
-
|
20
|
-
def warn str, id=''
|
21
|
-
Kernel.warn "Warning: "+str+" <#{id}>"
|
22
|
-
end
|
23
|
-
|
24
|
-
def error str, id=''
|
25
|
-
Kernel.warn "Error: "+str+" <#{id}>"
|
26
|
-
exit(1) if $stop_on_error
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
# Helper class for counting IDs
|
31
|
-
class Counter < Hash
|
32
|
-
def add id
|
33
|
-
self[id] = 0 if self[id] == nil
|
34
|
-
self[id] += 1
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
# Helper class for storing linked records based on a shared ID
|
39
|
-
class LinkedRecs < Hash
|
40
|
-
include Error
|
41
|
-
def add id, rec
|
42
|
-
info "Adding #{rec.feature_type} <#{id}>"
|
43
|
-
self[id] = [] if self[id] == nil
|
44
|
-
self[id] << rec
|
45
|
-
end
|
46
|
-
|
47
|
-
# Validate all lists belong to the same container/component
|
48
|
-
def validate_seqname
|
49
|
-
each do | id, rec |
|
50
|
-
seqname = rec.first.seqname
|
51
|
-
rec.each do | section |
|
52
|
-
raise "Non-matching seqname #{section.seqname} in #{seqname}" if section.seqname != seqname
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
# Validate all lists share the same parent (if available). First checks
|
58
|
-
# for Parent attribute, next for mRNA attribute
|
59
|
-
def validate_shared_parent
|
60
|
-
each do | id, rec |
|
61
|
-
parent = rec.first.get_attribute('Parent')
|
62
|
-
if parent
|
63
|
-
rec.each do | section |
|
64
|
-
_parent = section.get_attribute('Parent')
|
65
|
-
raise "Non-matching parent #{_parent} and #{parent} in #{id}" if _parent != parent
|
66
|
-
end
|
67
|
-
end
|
68
|
-
parent = rec.first.get_attribute('mRNA')
|
69
|
-
if parent
|
70
|
-
rec.each do | section |
|
71
|
-
_parent = section.get_attribute('mRNA')
|
72
|
-
raise "Non-matching parent #{_parent} and #{parent} in #{id}" if _parent != parent
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
# walk all (CDS) lists for every container/component and
|
79
|
-
# validate they do not overlap
|
80
|
-
def validate_nonoverlapping
|
81
|
-
each do | id, rec |
|
82
|
-
sections = Sections::sort(rec)
|
83
|
-
sections.each_with_index do | check, i |
|
84
|
-
neighbour = sections[i+1]
|
85
|
-
if neighbour and check.intersection(neighbour)
|
86
|
-
warn "Overlapping sections for ",id
|
87
|
-
end
|
88
|
-
end
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
class Section < Range
|
94
|
-
attr_reader :rec
|
95
|
-
def initialize rec
|
96
|
-
super(rec.start,rec.end)
|
97
|
-
@rec = rec
|
98
|
-
end
|
99
|
-
def intersection(other)
|
100
|
-
raise ArgumentError, 'value must be a Range' unless other.kind_of?(Range)
|
101
|
-
min, max = first, exclude_end? ? max : last
|
102
|
-
other_min, other_max = other.first, other.exclude_end? ? other.max : other.last
|
103
|
-
new_min = self === other_min ? other_min : other === min ? min : nil
|
104
|
-
new_max = self === other_max ? other_max : other === max ? max : nil
|
105
|
-
new_min && new_max ? new_min..new_max : nil
|
106
|
-
end
|
107
|
-
alias_method :&, :intersection
|
108
|
-
def <=> other
|
109
|
-
first <=> other.first
|
110
|
-
end
|
111
|
-
end
|
112
|
-
|
113
|
-
module Sections
|
114
|
-
# Return list of sorted Sections
|
115
|
-
def Sections::sort rec
|
116
|
-
sections = []
|
117
|
-
rec.each do | section |
|
118
|
-
sections.push Section.new(section)
|
119
|
-
end
|
120
|
-
sections.sort
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
module Record
|
125
|
-
include Error
|
126
|
-
# Format a record ID by, first, getting the ID attribute. If that fails
|
127
|
-
# the seqname is used with the start/stop positions.
|
128
|
-
def Record::formatID rec
|
129
|
-
id = rec.id if rec.id
|
130
|
-
if !id
|
131
|
-
if rec.seqname
|
132
|
-
id = "#{rec.seqname} #{rec.start} #{rec.end}".strip
|
133
|
-
else
|
134
|
-
id = 'unknown'
|
135
|
-
$stderr.print "Record with unknown ID"+rec.to_s
|
136
|
-
end
|
137
|
-
end
|
138
|
-
id
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
module Gff3Component
|
143
|
-
|
144
|
-
include Error
|
145
|
-
|
146
|
-
COMPONENT_TYPES = %w{
|
147
|
-
gene SO:0000704 contig transcript Component region
|
148
|
-
}
|
149
|
-
|
150
|
-
# Walk the component list to find a matching component/container for a
|
151
|
-
# record. First use the parent ID. If that is missing go by sequence
|
152
|
-
# name.
|
153
|
-
def find_component rec
|
154
|
-
parent = rec.get_attribute('Parent')
|
155
|
-
if @componentlist[parent]
|
156
|
-
# nice, there is a match
|
157
|
-
info "find_component: Matched parent", parent
|
158
|
-
return @componentlist[parent]
|
159
|
-
end
|
160
|
-
search = rec.seqname
|
161
|
-
if @componentlist[search]
|
162
|
-
info "find_component: Matched seqname", search
|
163
|
-
return @componentlist[search]
|
164
|
-
end
|
165
|
-
@componentlist.each do | componentid, component |
|
166
|
-
# dissemble id
|
167
|
-
(id, start, stop) = componentid.split(/ /)
|
168
|
-
if id==search and rec.start >= start.to_i and rec.end <= stop.to_i
|
169
|
-
info "find_component: Matched column 0 and location", componentid
|
170
|
-
return component
|
171
|
-
end
|
172
|
-
end
|
173
|
-
# Ah, painful. At this point the record has no matching container, probably
|
174
|
-
# because it has no parent ID and the component has an ID. We have to go by
|
175
|
-
# ID for every component individually
|
176
|
-
@componentlist.each do | componentid, component |
|
177
|
-
if component.seqname==search and rec.start >= component.start and rec.end <= component.end
|
178
|
-
# p ["----",search,rec]
|
179
|
-
# p component
|
180
|
-
info "find_component: Matched (long search) column 0 and location", componentid
|
181
|
-
return component
|
182
|
-
end
|
183
|
-
end
|
184
|
-
warn "Could not find container/component for",Record::formatID(rec)
|
185
|
-
end
|
186
|
-
end
|
187
|
-
|
188
|
-
module Gff3Features
|
189
|
-
|
190
|
-
# Ignore the following features (case sensitive?)
|
191
|
-
IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + %w{
|
192
|
-
transposon Match similarity UTR
|
193
|
-
TF_binding_site intronSO:0000188 polyA_sequence SO:0000610
|
194
|
-
polyA_site SO:0000553
|
195
|
-
five_prime_UTR SO:0000204 three_prime_UTR SO:0000205
|
196
|
-
exon SO:0000147
|
197
|
-
}
|
198
|
-
end
|
199
|
-
|
200
|
-
module Gff3Sequence
|
201
|
-
# Patch a sequence together from a Sequence string and an array
|
202
|
-
# of records. Note that rec positions are 1-based coordinates, relative
|
203
|
-
# to the landmark given in column 1 - in this case the sequence as it
|
204
|
-
# is passed in. The following options are available:
|
205
|
-
#
|
206
|
-
# :reverse : do reverse if reverse is indicated (default true)
|
207
|
-
# :complement : do complement if reverse is indicated (default true)
|
208
|
-
# :phase : do set CDS phase (default false, normally ignore)
|
209
|
-
# :trim : make sure sequence is multiple of 3 nucleotide bps (default true)
|
210
|
-
#
|
211
|
-
# special options:
|
212
|
-
#
|
213
|
-
# :raw : raw sequence (all above false)
|
214
|
-
# :codonize : codon sequence (reverse, complement, and trim are true)
|
215
|
-
# :fix : fix errors (default false)
|
216
|
-
#
|
217
|
-
def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true, :fix=>false, :debug=>false }
|
218
|
-
# default to nil, if not passed in
|
219
|
-
do_debug = options[:debug]
|
220
|
-
do_phase = options[:phase]
|
221
|
-
do_fix = options[:fix]
|
222
|
-
# default to true, if not passed in
|
223
|
-
do_reverse = (options[:reverse] == false ? false : true)
|
224
|
-
do_trim = (options[:trim] == false ? false : true)
|
225
|
-
do_complement = (options[:complement] == false ? false : true)
|
226
|
-
if options[:raw]
|
227
|
-
do_phase = false
|
228
|
-
do_reverse = false
|
229
|
-
do_trim = false
|
230
|
-
do_complement = false
|
231
|
-
elsif options[:codonize]
|
232
|
-
do_phase = false
|
233
|
-
do_reverse = true
|
234
|
-
do_trim = true
|
235
|
-
do_complement = true
|
236
|
-
end
|
237
|
-
sectionlist = Sections::sort(reclist)
|
238
|
-
rec0 = sectionlist.first.rec
|
239
|
-
# we assume ORF is always read in the same direction
|
240
|
-
orf_reverse = (rec0.strand == '-')
|
241
|
-
orf_frame = startpos - 1
|
242
|
-
orf_frameshift = orf_frame % 3
|
243
|
-
sectionlist = sectionlist.reverse if orf_reverse
|
244
|
-
if do_debug
|
245
|
-
p "------------------"
|
246
|
-
p options
|
247
|
-
p [:reverse,do_reverse]
|
248
|
-
p [:complement,do_complement]
|
249
|
-
p [:trim,do_trim]
|
250
|
-
p [:orf_reverse, orf_reverse, rec0.strand]
|
251
|
-
end
|
252
|
-
|
253
|
-
if sequence.kind_of?(Bio::FastaFormat)
|
254
|
-
# BioRuby conversion
|
255
|
-
sequence = sequence.seq
|
256
|
-
end
|
257
|
-
# Generate array of sequences
|
258
|
-
seq = sectionlist.map { | section |
|
259
|
-
rec = section.rec
|
260
|
-
s = sequence[(section.begin-1)..(section.end-1)]
|
261
|
-
if do_reverse and orf_reverse
|
262
|
-
s = s.reverse
|
263
|
-
end
|
264
|
-
# Correct for phase. Unfortunately the use of phase is ambiguous.
|
265
|
-
# Here we check whether rec.start is in line with orf_frame. If it
|
266
|
-
# is, we correct for phase. Otherwise it is ignored.
|
267
|
-
if do_phase and rec.phase
|
268
|
-
phase = rec.phase.to_i
|
269
|
-
# if ((rec.start-startpos) % 3 == 0)
|
270
|
-
s = s[phase..-1]
|
271
|
-
# end
|
272
|
-
end
|
273
|
-
s
|
274
|
-
}
|
275
|
-
# p seq
|
276
|
-
seq = seq.join
|
277
|
-
if do_complement and do_reverse and orf_reverse
|
278
|
-
ntseq = Bio::Sequence::NA.new(seq)
|
279
|
-
seq = ntseq.forward_complement.upcase
|
280
|
-
end
|
281
|
-
# This is the place to fix sequences (e.g. the Wormbase bug)
|
282
|
-
if do_fix or @options[:fix] or @options[:fix_wormbase]
|
283
|
-
if @options[:fix_wormbase] and rec0.id.index('gene1')==0
|
284
|
-
# Wormbase gene1 only, so ignore rest
|
285
|
-
else
|
286
|
-
test_frame = 0
|
287
|
-
ntseq = Bio::Sequence::NA.new(seq)
|
288
|
-
aaseq = ntseq.translate
|
289
|
-
if aaseq.count('*') > 1
|
290
|
-
test_frame = 1
|
291
|
-
seq = seq[1..-1]
|
292
|
-
ntseq = Bio::Sequence::NA.new(seq)
|
293
|
-
aaseq = ntseq.translate
|
294
|
-
if aaseq.count('*') > 1
|
295
|
-
test_frame = 2
|
296
|
-
seq = seq[1..-1]
|
297
|
-
ntseq = Bio::Sequence::NA.new(seq)
|
298
|
-
aaseq = ntseq.translate
|
299
|
-
raise 'Validation problem '+rec0.id if aaseq.count('*') > 1
|
300
|
-
end
|
301
|
-
end
|
302
|
-
if test_frame > 0
|
303
|
-
warn rec0.id,"Frame adjusted to #{test_frame} (fix)"
|
304
|
-
end
|
305
|
-
end
|
306
|
-
end
|
307
|
-
if do_trim
|
308
|
-
reduce = seq.size % 3
|
309
|
-
seq = seq[0..(seq.size-1-reduce)] if reduce != 0
|
310
|
-
end
|
311
|
-
if @options[:validate]
|
312
|
-
ntseq = Bio::Sequence::NA.new(seq)
|
313
|
-
aaseq = ntseq.translate
|
314
|
-
raise 'Validate translation problem '+rec0.id+"\n"+seq if aaseq.count('*') > 1
|
315
|
-
end
|
316
|
-
|
317
|
-
retval = seq
|
318
|
-
retval
|
319
|
-
end
|
320
|
-
|
321
|
-
# Patch a sequence together from a Sequence string and an array
|
322
|
-
# of records and translate in the correct direction and frame. The options
|
323
|
-
# are the same as for +assemble+, except :trim defaults to true.
|
324
|
-
def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true }
|
325
|
-
seq = assemble(sequence, startpos, reclist, options)
|
326
|
-
ntseq = Bio::Sequence::NA.new(seq)
|
327
|
-
ntseq.translate
|
328
|
-
end
|
329
|
-
|
330
|
-
# Create a description for output
|
331
|
-
def description id, component, rec
|
332
|
-
sections = Sections::sort(rec)
|
333
|
-
id+' Sequence:'+component.seqname+"_#{component.start}:#{component.end} ("+
|
334
|
-
sections.map { |s| "#{s.first}:#{s.last}" }.join(', ') +")"
|
335
|
-
end
|
336
|
-
|
337
|
-
end
|
338
|
-
end # Helpers
|
339
|
-
|
340
|
-
end
|
341
|
-
end
|