orf_finder 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. checksums.yaml +7 -0
  2. data/lib/orf.rb +306 -0
  3. data/lib/orf_common.rb +106 -0
  4. data/lib/orf_finder.rb +50 -0
  5. metadata +114 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 192ce308aeacff33641fd4f5aa62c7566787a47e
4
+ data.tar.gz: 6a55712c9764c424989fce7f46f68b76e6b7d259
5
+ SHA512:
6
+ metadata.gz: 46d2a795e81a037e61079ca4c0168fd57a3be4f84f090cda83da114412903cc8a2187787dfa4798f74ce00b570382e7f416078f11f722b401a67c9ab01b25012
7
+ data.tar.gz: c99c76e64c9dd9bcff8c4b69361ffd7215c5a2fc4b63442904c0e9dab6d5e2d0a652549c343ac2e787e5a41183cbaf89f4932223b34cd887ca59a1c507e8506b
data/lib/orf.rb ADDED
@@ -0,0 +1,306 @@
1
+ require 'bio'
2
+
3
+ require_relative 'orf_common'
4
+ require_relative 'orf_finder'
5
+ #
6
+ #
7
+ #
8
# Searches the three forward reading frames of one nucleotide sequence for
# open reading frames (ORFs) and exposes the longest one found in each frame.
#
# Ranges are hashes of the form { start:, stop:, fallback: } holding
# inclusive, 0-based positions into #seq. A range flagged :fallback lacks a
# real start codon, a real stop codon, or both: it was anchored to the start
# of the frame and/or to the end of the sequence instead.
class ORF
  include ORF::ORFCommon

  attr_reader :logger, :seq, :sequence
  attr_accessor :options

  # Normalizes the input, merges the given options over
  # ORFFinder::DEFAULT_OPTIONS, prepares the logger and runs the search.
  #
  # sequence    - plain String or sequence object; a plain String is
  #               wrapped in Bio::Sequence::NA
  # options     - Hash of overrides (nil is treated as an empty Hash)
  # logger_file - optional Logger; it is cloned before its progname and
  #               level are changed
  def initialize(sequence, options = {}, logger_file = nil)
    # Merge first: the logger level below reads the options, and a nil
    # options argument must never be dereferenced.
    options = ORFFinder::DEFAULT_OPTIONS.merge(options || {})
    self.options = options

    @logger = logger_file.nil? ? Logger.new(STDOUT) : logger_file.clone
    logger.progname = 'ORFCommon'
    logger.level = (options[:debug] ? Logger::INFO : Logger::ERROR)

    # Only an exact String is wrapped; Bio::Sequence::NA is itself a String
    # subclass, so a looser is_a? check would re-wrap sequence objects.
    sequence = Bio::Sequence::NA.new(sequence) if sequence.instance_of?(String)
    @sequence = sequence
    @seq = @sequence.to_s

    logger.info 'ORF has been initialized'
    find
  end

  # Convenience entry point: builds an ORF for the sequence and returns
  # the result of #find (the per-frame hash, or the sequence itself when
  # it is nil or empty).
  def self.find(sequence, options = {})
    ORF.new(sequence, options).find
  end

  # Returns the aminoacid sequence of the longest ORF in each frame as
  # { frame1:, frame2:, frame3: }. The value is computed on first use.
  def aa
    return @res_aa unless @res_aa.nil?
    # longest stores the translation in @res_aa as a side effect; when
    # there was nothing to search it returns the blank per-frame hash.
    blank = longest
    @res_aa || blank
  end

  # Returns the nucleotide sequence of the longest ORF in each frame as
  # { frame1:, frame2:, frame3: }. The value is computed on first use.
  def nt
    @res_nt || longest
  end

  # Finds all possible ORFs in the sequence.
  #
  # Returns the sequence unchanged when it is nil or empty; otherwise a
  # hash { frame1:, frame2:, frame3: } whose values hold :orfs (candidate
  # ranges) and :longest (the longest candidate, or nil).
  def find
    # if sequence is nil or empty there is no point
    # in trying to run the find algorithm
    return sequence if sequence.nil? || sequence.size == 0

    orf = { frame1: {}, frame2: {}, frame3: {} }
    start_idx = all_codons_indices(:start)
    stop_idx = all_codons_indices(:stop)
    res = all_sequences(start_idx, stop_idx, seq.size, [0, 1, 2])

    logger.info "start codons idx: #{start_idx}"
    logger.info "stop codons idx: #{stop_idx}"
    logger.info res
    # iterate over each frame and range to return the
    # longest above the minimum sequence length
    # these are the preferences:
    #   1: range that has start and stop codons
    #   2: range that only has start/stop
    #   3: full sequence
    res.each_with_index { |frame, index| find_longest(frame, index, orf) }
    # print ranges if debug is activated
    if options[:debug]
      orf.each { |key, found| found[:orfs].each { |r| print_range(key, r) } }
    end

    @orf = orf
  end

  private

  # Stores the candidate ranges of one frame in orf and records the
  # longest of them. Ranges with start and stop codons win over fallback
  # ranges: fallbacks are only used when no other range exists.
  def find_longest(frame, index, orf)
    fallback_ranges, valid_ranges = frame.partition { |r| r[:fallback] }
    entry = orf[frame_sym(index)]
    entry[:orfs] = valid_ranges.empty? ? fallback_ranges : valid_ranges

    best = nil
    best_len = nil
    entry[:orfs].each do |range|
      len = range[:stop] - range[:start] + 1
      # strict comparison: on a tie the first range encountered is kept
      next unless best.nil? || len > best_len
      best = range
      best_len = len
    end
    entry[:longest] = best
  end

  # Gets the longest sequence in each frame and translates it to
  # aminoacids. Caches both results (@res_nt, @res_aa) and returns the
  # nucleotide hash.
  def longest
    # run find method if search has not been done
    find if @orf.nil?

    res_nt = { frame1: '', frame2: '', frame3: '' }
    # if @orf is empty then no point in continuing
    return res_nt if @orf.nil? || @orf.size == 0

    @orf.each { |key, val| res_nt[key] = get_range(val[:longest]) }
    @res_nt = res_nt

    res_aa = { frame1: '', frame2: '', frame3: '' }
    res_nt.each { |key, val| res_aa[key] = val.translate }
    @res_aa = res_aa
    # return the nucleotide sequence as default
    res_nt
  end

  # Finds the positions of every codon configured under option_name
  # (:start or :stop) and returns them sorted, after index_normalization
  # (first base after a start codon / last base before a stop codon).
  # Returns [] when no codons are configured for that option.
  def all_codons_indices(option_name)
    kind = option_name.to_sym
    codons = options[kind]
    return [] if codons.nil? || codons.empty?

    positions = []
    codons.each do |codon|
      hit = seq.index(codon, 0)
      until hit.nil?
        positions << index_normalization(kind, hit)
        # advance by one base so overlapping occurrences are found too
        hit = seq.index(codon, hit + 1)
      end
    end
    positions.sort
  end

  # Keeps only the indexes that belong to the given reading frame.
  # Stop indexes were normalized to one base before the codon, so they
  # need a +1 to line up with the frame again; start indexes (shifted by
  # a whole codon) are already aligned. The start flag says which of the
  # two kinds idxs holds.
  def filter_codons_by_frame(idxs, frame, start = true)
    offset = start ? 0 : 1
    idxs.select { |i| (i + offset - frame) % 3 == 0 }
  end

  # Builds the candidate ranges for one frame from its start and stop
  # indexes. When the frame has no start (or no stop) codon, the frame
  # start (or the last complete codon of the sequence) stands in for it.
  # Falls back to the fallback ranges when no range has both codons,
  # dropping any fallback that spans the whole frame.
  def valid_sequences_by_frame(start_idxs, stop_idxs, frame, seq_size)
    # trim the tail so that the frame only covers complete codons
    seq_size -= (seq_size - frame) % 3
    start = start_idxs.clone
    stop = stop_idxs.clone

    stop << seq_size - 1 if stop_idxs.empty?
    start << frame if start_idxs.empty?

    if options[:debug]
      logger.info "frame: #{frame}"
      logger.info " start: #{start} | stop :#{stop}"
      logger.info " seq size: #{seq_size}"
      logger.info " #{seq[frame..seq_size]}"
    end

    valid = []
    fallback = []
    sequences_in_frame({ start: start, stop: stop },
                       { valid: valid, fallback: fallback },
                       seq_size,
                       frame,
                       start_idxs.empty? || stop_idxs.empty?)
    if valid.empty?
      valid = fallback.uniq.reject do |r|
        get_range_str(r[:start], r[:stop], false).size == size_of_frame(frame)
      end
      logger.info 'no ORF with start and stop codons,' \
                  ' defaulting to fallback'
    end
    valid
  end

  # Given start and stop codon indexes (both sorted), decides which
  # start/stop pairs form an ORF candidate and appends each candidate to
  # arrays[:valid] or arrays[:fallback].
  # added_pos tells whether a synthetic start or stop was injected by the
  # caller; pairs built from them are flagged as fallback.
  # TODO: reject sequences that have a stop codon in them
  def sequences_in_frame(idxs, arrays, seq_size, frame, added_pos)
    start = idxs[:start]
    stop = idxs[:stop]
    min = options[:min]
    last = seq_size - 1
    # first stop at or after the beginning of the frame; it does not
    # depend on the loops below, so it is looked up only once
    first_stop = stop.bsearch { |el| el >= (frame - 1) }
    found = []

    start.each do |pos_start|
      # first stop at or after this start; constant for the inner loop
      next_stop = stop.bsearch { |el| el >= (pos_start - 1) }

      stop.each do |pos_stop|
        # add a fallback that starts from the beginning of the frame,
        # but only up to the first stop codon of the frame
        if (pos_stop + 1 - frame) >= min && !(pos_stop > first_stop)
          found << { start: frame, stop: pos_stop, fallback: true }
        end
        # ignore if start is bigger than stop index
        next if pos_start >= pos_stop
        # ignore if there is a stop codon between pos_start
        # and pos_stop
        next if pos_stop > next_stop
        # ignore if size of orf is smaller than minimum
        next if (pos_stop + 1 - pos_start) < min
        # if all conditions hold add as valid orf
        found << { start: pos_start,
                   stop: pos_stop,
                   fallback: added_pos }
      end

      # fallback from this start up to the end of the sequence, only
      # when it is long enough and no stop codon cuts it short
      next unless (last - pos_start) >= min
      next if !next_stop.nil? && last > next_stop
      found << { start: pos_start,
                 stop: last,
                 fallback: true }
    end

    found.each do |item|
      arrays[item[:fallback] ? :fallback : :valid] << item
    end
  end

  # Computes the candidate ranges of every requested reading frame.
  # Returns one array of ranges per entry of read_frame, in that order.
  def all_sequences(start_idx, stop_idx, seq_size, read_frame = [0, 1, 2])
    read_frame.map do |frame|
      valid_sequences_by_frame(filter_codons_by_frame(start_idx, frame, true),
                               filter_codons_by_frame(stop_idx, frame, false),
                               frame, seq_size)
    end
  end
end
data/lib/orf_common.rb ADDED
@@ -0,0 +1,106 @@
1
+ require 'logger'
2
+ #
3
+ #
4
+ #
5
class ORF
  # Helpers shared by the ORF search code. The including object must
  # provide #seq (the sequence as a String) and #options (a Hash with the
  # :start and :stop codon lists).
  module ORFCommon
    # Prints the given range (see print_range) using str as its label.
    def range_to_s(range, str = '')
      print_range(str, range)
    end

    private

    # Transforms a range into a Bio::Sequence::NA.
    # Accepts either a range hash ({ start:, stop: }) as single argument
    # or the start and stop positions as two arguments. A nil first
    # argument yields an empty sequence.
    def get_range(arg1, arg2 = nil)
      return Bio::Sequence::NA.new('') if arg1.nil?
      start, stop = arg2.nil? ? [arg1[:start], arg1[:stop]] : [arg1, arg2]
      Bio::Sequence::NA.new(get_range_str(start, stop))
    end

    # Transforms a range into a String (positions are inclusive).
    # With include_codons, the start codon right before the range and
    # the stop codon right after it are added when they are present.
    def get_range_str(start, stop, include_codons = true)
      start_codon = ''
      stop_codon = ''
      if include_codons
        # Array() tolerates options without :start / :stop codon lists
        if start - 3 >= 0 &&
           Array(options[:start]).include?(seq[(start - 3)..(start - 1)])
          start_codon = seq[(start - 3)..(start - 1)]
        end

        if stop + 3 <= seq.size - 1 &&
           Array(options[:stop]).include?(seq[(stop + 1)..(stop + 3)])
          stop_codon = seq[(stop + 1)..(stop + 3)]
        end
      end
      "#{start_codon}#{seq[start..stop]}#{stop_codon}"
    end

    # Auxiliary method that prints a range to standard output as
    # "<key>: <before>|<orf>|<after> : size=<n>", with the bases
    # separated by spaces. A nil range is printed as "(empty)".
    def print_range(key, range)
      # simple proc to add spaces between bases
      spaced = proc { |str| str.gsub(/([atgc]{1})/, '\1 ').strip }
      if range.nil?
        str = "#{key} : (empty)"
      else
        last = seq.size - 1
        orf = spaced.call(get_range_str(range[:start], range[:stop]))
        pre = if range[:start] == 0
                ''
              else
                spaced.call(get_range_str(0, range[:start] - 1))
              end
        # ranges carry :stop (there is no :end key); comparing the right
        # key keeps the suffix empty when the range reaches the last base
        suf = if range[:stop] == last
                ''
              else
                spaced.call(get_range_str(range[:stop] + 1, last))
              end

        sep = '|'
        str = "#{key}: #{pre}#{sep}#{orf}#{sep}#{suf}"
        str += ' : ' \
               "size=#{seq[range[:start]..range[:stop]].size}"
        str += ' (fallback)' if range[:fallback]
      end
      puts str
    end

    # Necessary normalization for an index to start right after the
    # start codon and to end just before the stop codon.
    # example: aaa atg aaa aaa taa aaa
    #   the search results in codons 2 and 5, while the
    #   resulting orf is made of codons 3 and 4
    # Returns nil for any option_name other than :start or :stop.
    def index_normalization(option_name, idx)
      case option_name
      when :start then idx + 3
      when :stop then idx - 1
      end
    end

    # Creates the hash symbol for a frame index (0 => :frame1, ...).
    def frame_sym(index)
      "frame#{index + 1}".to_sym
    end

    # Number of bases of the frame that belong to complete codons.
    def size_of_frame(frame)
      usable = seq.size - frame
      usable - usable % 3
    end
  end
end
data/lib/orf_finder.rb ADDED
@@ -0,0 +1,50 @@
1
+ require_relative 'orf'
2
+
3
+ #
4
+ #
5
+ # Wrapper class that processes the direct and reverse sequences
6
# Wrapper class that runs the ORF search on the direct strand and on the
# complemented strand of a sequence, and collects both results.
class ORFFinder
  # Options used for every key the caller does not override.
  DEFAULT_OPTIONS = {
    start: %w(atg),
    stop: %w(tag taa tga),
    reverse: true,
    direct: true,
    min: 6,
    default_to_seq: false,
    debug: false
  }

  # sequence - plain String (wrapped in Bio::Sequence::NA) or a sequence
  #            object that responds to #complement
  # options  - Hash merged over DEFAULT_OPTIONS (nil counts as empty);
  #            :direct and :reverse select which strands are searched
  # logger   - optional Logger handed over to each ORF instance
  def initialize(sequence, options = {}, logger = nil)
    sequence = Bio::Sequence::NA.new(sequence) if sequence.instance_of?(String)
    overrides = options.nil? ? {} : options
    options = DEFAULT_OPTIONS.merge(overrides)

    @output = {}
    if options[:direct]
      @output[:direct] = ORF.new(sequence, options, logger)
    end
    if options[:reverse]
      @output[:reverse] = ORF.new(sequence.complement, options, logger)
    end
  end

  # Nucleotide results of every searched strand, keyed by
  # :direct / :reverse.
  def nt
    @output.each_with_object({}) do |(strand, orf), res|
      res[strand] = orf.nt
    end
  end

  # Aminoacid results of every searched strand, keyed by
  # :direct / :reverse.
  def aa
    @output.each_with_object({}) do |(strand, orf), res|
      res[strand] = orf.aa
    end
  end

  # ORF instance for the direct strand (nil when it was not searched).
  def direct
    @output[:direct]
  end

  # ORF instance for the reverse strand (nil when it was not searched).
  def reverse
    @output[:reverse]
  end
end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: orf_finder
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - André Veríssimo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-01-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bio
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.5.0
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.5'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.5.0
33
+ - !ruby/object:Gem::Dependency
34
+ name: byebug
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '8.2'
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 8.2.1
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '8.2'
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 8.2.1
53
+ - !ruby/object:Gem::Dependency
54
+ name: rspec
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '3.4'
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 3.4.0
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '3.4'
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 3.4.0
73
+ description: |2
74
+ ORF Finder is a library that, given a sequence of nucleotides,
75
+ finds all the possible ORFs in the sequence.
76
+ It will look for a sequence that starts with a start codon and
77
+ ends with a stop codon.
78
+ It will default to the beginning of the sequence if it cannot
79
+ find an ORF long enough with the start codons. It will also
80
+ use the end of the sequence if no stop codons are present in the
81
+ sequence reading frame.
82
+ email: andre.verissimo@tecnico.ulisboa.pt
83
+ executables: []
84
+ extensions: []
85
+ extra_rdoc_files: []
86
+ files:
87
+ - lib/orf.rb
88
+ - lib/orf_common.rb
89
+ - lib/orf_finder.rb
90
+ homepage: http://rubygems.org/gems/hola
91
+ licenses:
92
+ - GPL v3
93
+ metadata: {}
94
+ post_install_message:
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ requirements: []
109
+ rubyforge_project:
110
+ rubygems_version: 2.4.8
111
+ signing_key:
112
+ specification_version: 4
113
+ summary: Finds the longest orfs in a nucleotide sequence.
114
+ test_files: []