bio-bigbio 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,289 @@
1
+
2
+ require 'set'
3
+
4
+ module Bio
5
+ module Big
6
+
7
# Shared helpers for codon-frame handling: codon sets, position tracking,
# construction of follow-up frames, and the codon sequence container.
module FrameCodonHelpers

  # Codons (DNA and RNA spelling) that terminate or start an ORF.
  STOP_CODONS = Set.new(%w{TAG TAA TGA UAG UAA UGA}).freeze
  START_CODONS = Set.new(%w{ATG AUG}).freeze

  # Track sequence position in parent sequence (in nucleotides)
  module TrackSequenceTrait
    attr_accessor :track_ntseq_pos

    # Set +track_ntseq_pos+ on every item of +orfs+ to +ntseq_pos+ plus the
    # item's codon offset (+pos+) times three. Returns the same +orfs+ object.
    def TrackSequenceTrait.update_sequence_pos(orfs, ntseq_pos)
      orfs.each { |orf| orf.track_ntseq_pos = ntseq_pos + orf.pos*3 }
      orfs
    end

    # Counterpart used for reversed frames; the arithmetic is the same as in
    # update_sequence_pos.
    def TrackSequenceTrait.update_reversed_sequence_pos(orfs, ntseq_pos)
      orfs.each { |orf| orf.track_ntseq_pos = ntseq_pos + orf.pos*3 }
      orfs
    end
  end

  # Functions that move a frame forward, or backward,
  # creating new short frames.
  module CreateShortFrame

    # Build the next forward frame: drop everything of +fr.seq+ up to the
    # right edge (+rpos+, in codons) of the last ORF in +orfs+, append the
    # new sequence +rseq+, and advance the nucleotide position by the
    # amount dropped. With no ORFs nothing is dropped.
    def CreateShortFrame.create_right(fr, orfs, rseq)
      seq = fr.seq
      ntseq_pos = fr.ntseq_pos
      remove = orfs.size > 0 ? orfs.last.rpos*3 : 0
      ntseq_pos += remove
      nseq = seq[remove..-1] + rseq
      ShortFrameState.new nseq, ntseq_pos, fr.min_size_codons*3
    end

    # Build the next reversed frame by prepending +nseq+ to the retained
    # left part of +fr.seq+.
    def CreateShortFrame.create_left(fr, orfs, nseq)
      # Reversed (real locations on contig):
      #
      # | 3 21 B |
      # ttaaatgtaatttaggtaaatttat atgtaaattaggta (reversed)
      # ...^--============xxx^=======xxx
      # ^ ^
      # Actual feed:
      #
      # s2= s1=
      # "atggattaaatgta" "tatttaaatggatttaatgtaaatt"
      # ......xxx===== ~===xx^============--^...
      # 0 1 2 3 0 1 2 3
      seq1 = fr.seq                      # original sequence
      len1 = seq1.size
      ntseq_pos1 = fr.ntseq_pos          # right side of seq (|)
      bridge = len1 % 3                  # chomp left side (B)
      remove = if orfs.size > 0
                 len1 - bridge - (orfs.first.pos)*3 + 1
               else
                 0
               end
      ntseq_pos2 = ntseq_pos1 + remove - 1 # pos against main contig
      seq2 = nseq + seq1[0..(len1 - remove)]
      ShortReversedFrameState.new seq2, ntseq_pos2, fr.min_size_codons*3
    end
  end

  # An ordered list of codons together with its codon offset (+pos+).
  class FrameCodonSequence
    include Enumerable
    include TrackSequenceTrait
    attr_reader :pos # codon position in short parent sequence
    attr_reader :codons

    # +seq+ is either a String, which is upcased and cut into triplets of
    # word characters (a trailing partial codon is ignored), or an already
    # split list of codons, which is stored as given.
    def initialize(seq, pos = 0)
      @codons = if seq.kind_of?(String)
                  seq.upcase.scan(/\w\w\w/)
                else
                  seq
                end
      @pos = pos
    end

    # Number of codons.
    def size
      @codons.size
    end

    # Codon position just right of this sequence (pos + size).
    def rpos
      pos + size
    end

    # Codon at +index+.
    def [](index)
      @codons[index]
    end

    # Return a NEW sequence without the first codon, positioned one codon
    # further. The receiver is left untouched: dropping the codon in place
    # would leave the receiver with a +pos+ that no longer matches its
    # content and would share one array between both objects.
    def shift
      FrameCodonSequence.new(@codons.drop(1), @pos + 1)
    end

    # Codons joined back into one string.
    def to_seq
      @codons.join
    end

    # Yield each codon; without a block an Enumerator is returned.
    def each
      return enum_for(:each) unless block_given?
      @codons.each { |c| yield c }
    end
  end
end # FrameCodonHelpers
106
+
107
+ # The short frame uses the simplest concept to find ORFs. The sequence is
108
+ # immutable, always forward and in frame 0. That makes it easy to reason.
109
+ # It also returns all ORFs in one go, with the left/right locations.
110
+
111
class ShortFrameState
  include FrameCodonHelpers

  attr_reader :seq, :ntseq_pos, :min_size_codons, :codons

  # seq::        nucleotide sequence of this frame (stored as given)
  # ntseq_pos::  position of the frame in the parent sequence, in nucleotides
  # ntmin_size:: minimum ORF size in nucleotides; values of 3 or less fall
  #              back to 2 codons, otherwise we get single STOP codons
  def initialize(seq, ntseq_pos, ntmin_size)
    @reversed = nil
    @seq = seq
    @ntseq_pos = ntseq_pos
    @min_size_codons = ntmin_size > 3 ? (ntmin_size/3).to_i : 2
    # NOTE(review): the nucleotide position is handed over as the second
    # (position) argument of FrameCodonSequence -- confirm this is intended.
    @codons = FrameCodonSequence.new(seq, ntseq_pos)
  end

  # Return a list of ORFs delimited by STOP codons.
  def get_stopstop_orfs
    is_stop = Proc.new { |codon| STOP_CODONS.include?(codon) }
    get_codon_orfs1(is_stop, false, true)
  end

  # Return a list of ORFs delimited by START-STOP codons
  def get_startstop_orfs
    is_stop  = Proc.new { |codon| STOP_CODONS.include?(codon) }
    is_start = Proc.new { |codon| START_CODONS.include?(codon) }
    get_codon_orfs2(is_stop, is_start)
  end

  # Split the frame with a single delimiter function.
  #
  # splitter_func::           callable, true for a delimiting codon
  # do_include_leftmost_orf:: when false, the first split result is dropped
  #                           unless it starts on a delimiter
  # do_strip_leading_codon::  when true, a leading delimiter codon is removed
  #                           from every result
  #
  # Returns the ORFs with their +track_ntseq_pos+ set against the parent
  # sequence, or [] when nothing was found.
  def get_codon_orfs1(splitter_func, do_include_leftmost_orf, do_strip_leading_codon)
    found = split(@codons, splitter_func)
    return [] if found.empty?

    # Drop the first sequence, if there is no match on the first position
    unless do_include_leftmost_orf || splitter_func.call(found.first[0])
      found.shift
    end

    found = found.map do |orf|
      if do_strip_leading_codon && splitter_func.call(orf[0])
        orf.shift
      else
        orf
      end
    end

    # nail against parent
    if @reversed.nil?
      TrackSequenceTrait.update_sequence_pos(found, @ntseq_pos)
    else
      TrackSequenceTrait.update_reversed_sequence_pos(found, @ntseq_pos)
    end
  end

  # Splitter for two delimiter functions: split on +splitter_func+ (keeping
  # the leftmost result, stripping leading delimiters) and keep only the
  # results whose first codon satisfies +start_func+.
  def get_codon_orfs2(splitter_func, start_func)
    candidates = get_codon_orfs1(splitter_func, true, true)
    candidates.select { |orf| start_func.call(orf[0]) }
  end

  # Return list of codon sequences, split on the +is_splitter_func+
  # function. A delimiter codon closes the running sequence and also opens
  # the next one; only sequences longer than the minimum size are kept.
  def split(codons, is_splitter_func)
    pieces  = []
    current = []
    codons.each_with_index do |codon, index|
      if is_splitter_func.call(codon)
        current << codon
        if current.size > @min_size_codons
          pieces << FrameCodonSequence.new(current, index + 1 - current.size)
        end
        current = []
      end
      current << codon # always push boundary codon
    end
    pieces
  end

end
190
+
191
+ # This is the reversed version, which is rather the same as the forward,
192
+ # though the tracked ntseq_pos should be seen from the end of the sequence,
193
+ # as we are emitting sequences from the end(!) Also we need to make sure
194
+ # the sequence is always in frame (from the left).
195
class ShortReversedFrameState < ShortFrameState
  attr_accessor :reversed

  # seq::        nucleotide sequence of the reversed frame
  # ntseq_pos::  position in the parent sequence, in nucleotides
  # ntmin_size:: minimum ORF size in nucleotides
  #
  # The codons are taken from +seq+ with the left-most (size % 3)
  # nucleotides removed, so the sequence is aligned on codons; +seq+ itself
  # is recorded in full.
  def initialize(seq, ntseq_pos, ntmin_size)
    chop = seq.size % 3 # align on codons
    super(seq[chop..-1], ntseq_pos, ntmin_size)
    # Set the flag AFTER super: the parent initializer assigns
    # @reversed = nil, which would wipe out a value set beforehand.
    @reversed = true
    @seq = seq # but record full seq
  end

end
206
+
207
class OrfEmitter

  # 6-frame ORF emitter for (growing) sequences from the +emit+
  # object. Type can be a symbol or a function. Symbols are
  #
  #   :stopstop    All sequences from STOP to STOP codon
  #   :startstop   All sequences from START to STOP codon
  #
  # size control is in nucleotides.
  #
  # The difference with most other getorf implementations, including
  # EMBOSS, is that:
  #
  #   1) ORFs get emitted during the reading of large continuous sequences,
  #      e.g. chromosomes.
  #   2) This allows processing in parallel to IO, even on a single CPU
  #   3) ORFs come with splitting CODONs
  #   4) Bordering ORFs are not included (by default), which is somehow
  #      not easy with EMBOSS getorf
  #
  # I have carefully designed this code, so it is easy to reason about
  # the steps and prove correct. It is easy to understand, and
  # therefore to parallelize correctly. Some features are:
  #
  #   5) Emit size does not matter for correctness
  #   6) Reverse strands are positioned according to
  #      GFF3 on the parent contig
  #
  # NOTE(review): +type+, +min_size+ and +max_size+ are stored here but are
  # not read by any method of this class; the frames below are always built
  # with position 0 and minimum size 0 and always use get_stopstop_orfs.
  def initialize(emit, type, min_size = 30, max_size = nil)
    @em       = emit
    @type     = type
    @min_size = min_size
    @max_size = max_size
  end

  # Concats sequences from the emitter and yields the
  # contained ORFs for every resulting frame (-3..-1, 1..3 ). Note
  # that for the reverse frame, the resulting sequence is complemented!
  # Translate these sequences in a forward frame only.
  #
  # First :head, then :mid parts get emitted, closed by the :tail part.
  # (The part marker is passed along but not acted upon.)
  #
  # Yields: frame, index, tag, tracked nucleotide position, ORF sequence.
  def emit_seq
    @em.emit_seq do |part, index, tag, seq|
      emit_forward(part, index, tag, seq) { |*fields| yield(*fields) }
      emit_reverse(part, index, tag, seq) { |*fields| yield(*fields) }
    end
  end

  private

  # Yield the STOP-to-STOP ORFs of frames 1..3, each frame starting one
  # nucleotide further into +seq+.
  def emit_forward(part, index, tag, seq)
    (1..3).each do |frame|
      state = ShortFrameState.new(seq[frame-1..-1], 0, 0)
      state.get_stopstop_orfs.each do |orf|
        yield frame, index, tag, orf.track_ntseq_pos, orf.to_seq
      end
    end
  end

  # Yield the STOP-to-STOP ORFs of frames -1..-3, computed on the result of
  # Bio::Sequence::NA#complement with 0, 1 or 2 trailing nucleotides cut off.
  def emit_reverse(part, index, tag, seq)
    rev_seq = Bio::Sequence::NA.new(seq).complement
    (1..3).each do |frame|
      state = ShortReversedFrameState.new(rev_seq[0..rev_seq.size-frame], 0, 0)
      state.get_stopstop_orfs.each do |orf|
        yield(-frame, index, tag, orf.track_ntseq_pos, orf.to_seq)
      end
    end
  end
end
288
+ end
289
+ end
@@ -0,0 +1,3 @@
1
+
2
# Placeholder class: it declares no methods or state of its own here.
class FastaIndex; end
@@ -0,0 +1,19 @@
1
+ # FASTA paired reader keeps track of two FASTA files containing
2
+ # matching NT and AA sequences.
3
+ #
4
+
5
class FastaPairedReader

  # Open the nucleotide file +ntfn+ and the amino acid file +aafn+ with one
  # FastaReader each; both receive the same +opts+.
  #
  # Lookup by id is the only operation of this class and FastaReader#get
  # needs an index, so :index defaults to true here. A caller can still
  # pass an explicit :index value.
  def initialize ntfn, aafn, opts={:regex => '(\S+)'}
    opts = {:index => true}.merge(opts)
    @nt = FastaReader.new(ntfn, opts)
    @aa = FastaReader.new(aafn, opts)
  end

  # Return the NT+AA pair for +id+ as a FastaPairedRecord.
  #
  # Returns nil when neither file has the id. Raises a RuntimeError when
  # only one of the two files has it, since the files are then out of sync.
  def get id
    nt = @nt.get(id)
    aa = @aa.get(id)
    return nil if nt.nil? && aa.nil?
    if nt.nil? || aa.nil?
      raise "ID #{id} found in only one of the NT/AA files"
    end
    FastaPairedRecord.new(nt, aa)
  end

end
@@ -0,0 +1,21 @@
1
+ # Paired FASTA writer (tracks matching NT and AA sequences in two
2
+ # FASTA files)
3
+ #
4
+
5
class FastaPairedWriter

  # Open one FastaWriter for the nucleotide file +ntfn+ and one for the
  # amino acid file +aafn+. When the second writer cannot be opened the
  # first one is closed again before the error is re-raised.
  def initialize ntfn, aafn
    @nt = FastaWriter.new(ntfn)
    begin
      @aa = FastaWriter.new(aafn)
    rescue StandardError
      @nt.close
      raise
    end
  end

  # Write the +nt+ part of +rec+ to the NT file and its +aa+ part to the
  # AA file.
  def write rec
    @nt.write rec.nt
    @aa.write rec.aa
  end

  # Close both writers. The AA writer is closed even when closing the NT
  # writer raises.
  def close
    @nt.close
  ensure
    @aa.close
  end
end
@@ -0,0 +1,132 @@
1
+ # Indexed FastaReader
2
+ #
3
+
4
+ require 'bigbio/db/fasta/indexer'
5
+
6
class FastaReader

  include Indexer

  # Initialize the reader of FASTA file _fn_. Options can be :regex (pattern
  # whose first capture group becomes the record id, default '^(\S+)') and
  # :index (true/false)
  def initialize fn, opts = {}
    @f = File.open(fn)
    @fread_once = false
    @regex = opts[:regex]
    @regex = '^(\S+)' if @regex == nil
    indexer_use opts[:index]
  end

  # Parse the FASTA file and yield id, descr, sequence. When the indexer is on
  # it will index the records the first time. Note that, with indexing, when
  # you don't complete parsing there will be an error the second time. This is
  # a trade-off, otherwise one would always have to index the file and read
  # it twice.
  #
  # An empty file yields nothing. A header without sequence lines (also
  # when it is the last line of the file) yields an empty sequence.
  def parse_each
    @f.seek 0 # force file rewind
    @count = 0
    @rec_fpos = 0
    @rec_line = @f.gets
    while @rec_line
      # digest id from record description
      id, descr = digest_tag(@rec_line)
      id_fpos = @rec_fpos
      # parse the sequence; gets returns nil at end of file
      seq = "".dup
      fpos = nil
      line = nil
      loop do
        fpos = @f.tell
        line = @f.gets
        break if line.nil? || line =~ /^>/
        seq << line.strip
      end
      # new record: remember where the next header (if any) starts
      @count += 1
      @rec_fpos = fpos
      @rec_line = line
      indexer_set(id, id_fpos) if @indexer and not @fread_once
      yield id, descr, seq
    end
    @fread_once = true
  end

  # returns a FastaRecord for every item (invokes parse_each)
  def each
    parse_each { | id, descr, seq | yield FastaRecord.new(id, descr, seq) }
  end

  # Return the first record of the file as a FastaRecord, nil when the file
  # holds no record. Parsing stops after the first record.
  def first
    parse_each { | id, descr, seq |
      return FastaRecord.new(id, descr, seq)
    }
    nil
  end

  # Return a record by its +id+, nil when not found
  def get id
    indexed?
    if fpos = indexer_get(id)
      get_rec(fpos)
    else
      nil
    end
  end

  # Read the record whose header line starts at file position +fpos+ and
  # return it as a FastaRecord.
  def get_rec fpos
    @f.seek fpos
    tag = @f.gets
    seq = "".dup
    # stop at the next header or at end of file (gets returns nil)
    while (line = @f.gets)
      break if line =~ /^>/
      seq << line.strip
    end
    id, descr = digest_tag(tag)
    FastaRecord.new(id,descr,seq)
  end

  # Return the record at position +idx+ in file order, nil when there is no
  # such record.
  def get_by_index idx
    indexed?
    entry = indexer_get_by_index(idx)
    return nil unless entry && entry[1]
    get_rec(entry[1])
  end

  # Split a header line into id and description. The description is the
  # text after '>' (stripped); the id is the first capture group of the
  # :regex option applied to the description. Raises a RuntimeError when
  # the line is not a header or the pattern does not match.
  def digest_tag tag
    if tag =~ /^>/
      descr = $'.strip
      if descr =~ /#{@regex}/
        id = $1
        return id, descr
      end
      p descr # do not remove these
      p @regex
    end
    # interpolation, so a Regexp given as :regex does not break the message
    raise "Can not digest '#{tag}' using '#{@regex}'"
  end

  # Returns the size of the dataset - as read. After the final
  # record the size represents the number of items in the FASTA file
  def size
    @count
  end

  # Close the underlying file.
  def close
    @f.close
  end

  private

  # Make sure the index is filled: when indexing is on and the file has not
  # been read completely yet, parse it once. Always returns true.
  def indexed?
    if @indexer and not @fread_once
      # force indexer
      parse_each { | x, y, z | nil }
    end
    true
  end

end
@@ -0,0 +1,39 @@
1
+
2
# One FASTA entry: its id, its description and its sequence. All three are
# stored as given and can be read and reassigned.
class FastaRecord
  attr_accessor :id, :descr, :seq

  def initialize(id, descr, seq)
    @id, @descr, @seq = id, descr, seq
  end
end
11
+
12
# A nucleotide record and an amino acid record that belong together.
class FastaPairedRecord
  attr_reader :nt, :aa

  # Pair +nt+ with +aa+. Both must respond to +id+ and +seq+.
  #
  # Raises when the ids differ. When the nucleotide sequence is 1, 2 or 3
  # positions short of three times the amino acid sequence, the sequences
  # are trimmed IN PLACE (chop!) so that both lose the incomplete last
  # codon (accounts for EMBOSS cleverness). Raises when the sizes still
  # do not match afterwards.
  def initialize(nt, aa)
    @nt = nt
    @aa = aa
    raise "ID error NT #{nt.id} not matching AA #{aa.id}" if nt.id != aa.id

    shortfall = aa.seq.size*3 - nt.seq.size
    case shortfall
    when 1
      # two dangling nucleotides and the amino acid derived from them
      2.times { nt.seq.chop! }
      aa.seq.chop!
    when 2
      # one dangling nucleotide and the amino acid derived from it
      nt.seq.chop!
      aa.seq.chop!
    when 3
      # one amino acid too many
      aa.seq.chop!
    end

    unless nt.seq.size == aa.seq.size*3
      raise "Sequence size mismatch for #{nt.id} <nt:#{nt.seq.size} != #{aa.seq.size*3} (aa:#{aa.seq.size}*3)>"
    end
  end

  # Id of the pair (taken from the amino acid record).
  def id
    @aa.id
  end
end
@@ -0,0 +1,20 @@
1
+ # Fasta writer
2
+
3
class FastaWriter

  # Open (create or truncate) the file +fn+ as the FASTA output stream.
  def initialize(fn)
    @f = File.open(fn, "w")
  end

  # Write one FASTA item: a header line made of '>', the item's id, a
  # space and its description, followed by one line holding the sequence
  # with surrounding whitespace stripped.
  def write(item)
    header = ">" + item.id + ' ' + item.descr + "\n"
    @f.write(header)
    sequence_line = item.seq.strip + "\n"
    @f.write(sequence_line)
  end

  # Close the output stream.
  def close
    @f.close
  end

end
@@ -0,0 +1,33 @@
1
+ # Indexer module for the FASTA class
2
+ #
3
+ # This is a simple memory based key storage
4
+ #
5
+
6
# In-memory key => file position store, kept in the @indexer instance
# variable of the including object. It stays unset until indexer_use is
# called with a true value.
module Indexer

  # Start using the indexer when +state+ is true; otherwise do nothing.
  def indexer_use state
    @indexer = {} if state
  end

  # Store file position +fpos+ under +key+. Raises when the index is not in
  # use or when +key+ is already present.
  def indexer_set key, fpos
    raise "Trying to use 'set' when there is no index" if @indexer == nil
    raise "Indexer key #{key} alread in use for <#{@indexer[key]}>!" if @indexer[key]
    @indexer[key] = fpos
  end

  # Get the key, return nil when not found. Raises when the index is not in
  # use.
  def indexer_get key
    raise "Trying to use 'get' when there is no index" if @indexer == nil
    @indexer[key]
  end

  # Return the [key, fpos] pair at position +idx+ when the entries are
  # ordered by file position, nil when +idx+ is out of range. Raises when
  # the index is not in use, like the other accessors.
  def indexer_get_by_index idx
    raise "Trying to use 'get_by_index' when there is no index" if @indexer == nil
    @indexer.sort_by { |_key, fpos| fpos }[idx]
  end
end
33
+
@@ -0,0 +1,13 @@
1
+ # fasta.rb
2
+ #
3
+ # This is a Bilib reference implementation of a FASTA reader and writer in
4
+ # Ruby.
5
+ #
6
+ # by Pjotr Prins (c) 2009
7
+ #
8
+
9
+ require 'bigbio/db/fasta/fastarecord'
10
+ require 'bigbio/db/fasta/fastareader'
11
+ require 'bigbio/db/fasta/fastawriter'
12
+ require 'bigbio/db/fasta/fastapairedreader'
13
+ require 'bigbio/db/fasta/fastapairedwriter'
@@ -0,0 +1,12 @@
1
+
2
+ require 'singleton'
3
+
4
module Bio
  module Big
    # Single shared object (via Singleton) carrying two freely assignable
    # attributes, +log+ and +biolib+. Obtain it with Environment.instance.
    class Environment
      include Singleton

      attr_accessor :log
      attr_accessor :biolib
    end
  end
end