transrate 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +1 -0
- data/LICENSE +2 -15
- data/README.md +14 -132
- data/Rakefile +19 -2
- data/bin/transrate +49 -10
- data/deps/deps.yaml +0 -10
- data/docs/transrate_logo_full.png +0 -0
- data/ext/transrate/extconf.rb +13 -0
- data/ext/transrate/transrate.c +223 -0
- data/lib/transrate.rb +1 -0
- data/lib/transrate/assembly.rb +12 -10
- data/lib/transrate/bowtie2.rb +7 -0
- data/lib/transrate/comparative_metrics.rb +103 -73
- data/lib/transrate/contig.rb +94 -93
- data/lib/transrate/contig_metrics.rb +1 -2
- data/lib/transrate/read_metrics.rb +13 -7
- data/lib/transrate/version.rb +1 -1
- data/test/helper.rb +1 -31
- data/test/test_bin.rb +99 -0
- data/test/test_bowtie.rb +12 -0
- data/test/test_comp_metrics.rb +161 -104
- data/test/test_contig.rb +62 -6
- data/test/test_contig_metrics.rb +2 -2
- data/test/test_inline.rb +2 -2
- data/test/test_transrater.rb +1 -1
- data/transrate.gemspec +5 -4
- metadata +40 -22
data/lib/transrate.rb
CHANGED
data/lib/transrate/assembly.rb
CHANGED
@@ -26,7 +26,7 @@ module Transrate
|
|
26
26
|
|
27
27
|
include Enumerable
|
28
28
|
extend Forwardable
|
29
|
-
def_delegators :@assembly, :each, :<<, :size, :length
|
29
|
+
def_delegators :@assembly, :each, :each_value, :<<, :size, :length, :[]
|
30
30
|
|
31
31
|
attr_accessor :file
|
32
32
|
attr_reader :assembly
|
@@ -43,11 +43,12 @@ module Transrate
|
|
43
43
|
unless File.exist? @file
|
44
44
|
raise IOError.new "Assembly file doesn't exist: #{@file}"
|
45
45
|
end
|
46
|
-
@assembly =
|
46
|
+
@assembly = {}
|
47
47
|
@n_bases = 0
|
48
48
|
Bio::FastaFormat.open(file).each do |entry|
|
49
49
|
@n_bases += entry.length
|
50
|
-
|
50
|
+
contig = Contig.new(entry)
|
51
|
+
@assembly[contig.name] = contig
|
51
52
|
end
|
52
53
|
@contig_metrics = ContigMetrics.new self
|
53
54
|
end
|
@@ -78,7 +79,7 @@ module Transrate
|
|
78
79
|
# @return [Hash] basic statistics about the assembly
|
79
80
|
def basic_stats threads=1
|
80
81
|
return @basic_stats if @basic_stats
|
81
|
-
bin = @assembly.
|
82
|
+
bin = @assembly.values
|
82
83
|
@basic_stats = basic_bin_stats bin
|
83
84
|
@basic_stats
|
84
85
|
end # basic_stats
|
@@ -103,7 +104,7 @@ module Transrate
|
|
103
104
|
# representing contigs in the assembly
|
104
105
|
|
105
106
|
def basic_bin_stats bin
|
106
|
-
|
107
|
+
|
107
108
|
# cumulative length is a float so we can divide it
|
108
109
|
# accurately later to get the mean length
|
109
110
|
cumulative_length = 0.0
|
@@ -194,7 +195,7 @@ module Transrate
|
|
194
195
|
covfile = Samtools.coverage bam
|
195
196
|
# get an assembly enumerator
|
196
197
|
assembly_enum = @assembly.to_enum
|
197
|
-
contig = assembly_enum.next
|
198
|
+
contig_name, contig = assembly_enum.next
|
198
199
|
# precreate an array of the correct size to contain
|
199
200
|
# coverage. this is necessary because samtools mpileup
|
200
201
|
# doesn't print a result line for bases with 0 coverage
|
@@ -209,12 +210,13 @@ module Transrate
|
|
209
210
|
break
|
210
211
|
end
|
211
212
|
# extract the columns
|
212
|
-
name
|
213
|
-
|
214
|
-
|
213
|
+
name = Bio::FastaDefline.new(cols[name_i]).entry_id
|
214
|
+
pos, cov = cols[pos_i].to_i, cols[cov_i].to_i
|
215
|
+
unless contig_name == name
|
216
|
+
while contig_name != name
|
215
217
|
begin
|
216
218
|
block.call(contig, contig.coverage)
|
217
|
-
contig = assembly_enum.next
|
219
|
+
contig_name, contig = assembly_enum.next
|
218
220
|
contig.coverage = Array.new(contig.length, 0)
|
219
221
|
rescue StopIteration => stop_error
|
220
222
|
logger.error 'reached the end of assembly enumerator while ' +
|
data/lib/transrate/bowtie2.rb
CHANGED
@@ -49,6 +49,9 @@ module Transrate
|
|
49
49
|
# run bowtie
|
50
50
|
runner = Cmd.new bowtiecmd
|
51
51
|
runner.run
|
52
|
+
if !runner.status.success?
|
53
|
+
raise Bowtie2Error.new("Bowtie2 failed\n#{runner.stderr}")
|
54
|
+
end
|
52
55
|
end
|
53
56
|
@sam
|
54
57
|
end
|
@@ -59,6 +62,10 @@ module Transrate
|
|
59
62
|
cmd = "#{@bowtie2_build} --quiet --offrate 1 #{file} #{@index_name}"
|
60
63
|
runner = Cmd.new cmd
|
61
64
|
runner.run
|
65
|
+
if !runner.status.success?
|
66
|
+
msg = "Failed to build Bowtie2 index\n#{runner.stderr}"
|
67
|
+
raise Bowtie2Error.new(msg)
|
68
|
+
end
|
62
69
|
end
|
63
70
|
@index_built = true
|
64
71
|
end
|
@@ -10,86 +10,89 @@ module Transrate
|
|
10
10
|
attr_reader :reciprocal_hits
|
11
11
|
attr_reader :has_run
|
12
12
|
attr_reader :reference_coverage
|
13
|
+
attr_reader :comp_stats
|
13
14
|
attr_reader :n_chimeras, :p_chimeras
|
14
15
|
|
15
16
|
def initialize assembly, reference, threads
|
16
17
|
@assembly = assembly
|
17
18
|
@reference = reference
|
18
19
|
@threads = threads
|
20
|
+
@comp_stats = Hash.new
|
19
21
|
end
|
20
22
|
|
21
23
|
def run
|
22
24
|
@crbblast = reciprocal_best_blast
|
23
|
-
@
|
24
|
-
@collapse_factor = collapse_factor @crbblast.
|
25
|
+
@reference_coverage = coverage @crbblast
|
26
|
+
@collapse_factor = collapse_factor @crbblast.reciprocals
|
25
27
|
@reciprocal_hits = @crbblast.size
|
26
28
|
@rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
|
27
|
-
@reference_coverage = @ortholog_hit_ratio * @rbh_per_reference
|
28
|
-
@rbh_per_contig = @reciprocal_hits.to_f / @assembly.assembly.size.to_f
|
29
29
|
@p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
|
30
30
|
@n_contigs_with_recip = @crbblast.reciprocals.size
|
31
|
+
count_ref_crbbs
|
31
32
|
@p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
|
32
33
|
chimeras @crbblast
|
34
|
+
self.run_comp_stats
|
33
35
|
@has_run = true
|
34
36
|
end
|
35
37
|
|
36
|
-
def
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
:collapse_factor => @collapse_factor,
|
48
|
-
:n_chimeras => @n_chimeras,
|
49
|
-
:p_chimeras => @p_chimeras,
|
50
|
-
:cov25 => @cov[0],
|
51
|
-
:cov50 => @cov[1],
|
52
|
-
:cov75 => @cov[2],
|
53
|
-
:cov85 => @cov[3],
|
54
|
-
:cov95 => @cov[4],
|
55
|
-
:p_cov25 => @cov[0]/@reference.size.to_f,
|
56
|
-
:p_cov50 => @cov[1]/@reference.size.to_f,
|
57
|
-
:p_cov75 => @cov[2]/@reference.size.to_f,
|
58
|
-
:p_cov85 => @cov[3]/@reference.size.to_f,
|
59
|
-
:p_cov95 => @cov[4]/@reference.size.to_f
|
60
|
-
}
|
38
|
+
def run_comp_stats
|
39
|
+
@comp_stats[:CRBB_hits] = @reciprocal_hits # CRBB hits
|
40
|
+
@comp_stats[:p_contigs_with_CRBB] = @p_contigs_with_recip
|
41
|
+
@comp_stats[:n_contigs_with_CRBB] = @n_contigs_with_recip
|
42
|
+
@comp_stats[:p_refs_with_CRBB] = @p_refs_with_recip
|
43
|
+
@comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
|
44
|
+
@comp_stats[:rbh_per_reference] = @rbh_per_reference
|
45
|
+
@comp_stats[:reference_coverage] = @reference_coverage
|
46
|
+
@comp_stats[:collapse_factor] = @collapse_factor
|
47
|
+
@comp_stats[:n_chimeras] = @n_chimeras
|
48
|
+
@comp_stats[:p_chimeras] = @p_chimeras
|
61
49
|
end
|
62
50
|
|
63
51
|
def reciprocal_best_blast
|
64
|
-
crbblast = CRB_Blast.new @assembly.file, @reference.file
|
65
|
-
crbblast.run
|
52
|
+
crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
|
53
|
+
crbblast.run(1e-5, @threads, true)
|
66
54
|
crbblast
|
67
55
|
end
|
68
56
|
|
69
57
|
# coverage of contigs that have reciprocal hits
|
70
|
-
# divided by
|
71
|
-
|
72
|
-
|
73
|
-
|
58
|
+
# divided by number of reciprocal targets
|
59
|
+
def coverage crbblast
|
60
|
+
return @reference_coverage unless @reference_coverage.nil?
|
61
|
+
crbblast.reciprocals.each do |key, list|
|
62
|
+
list.each_with_index do |hit, i|
|
63
|
+
unless @reference.assembly.key? hit.target
|
64
|
+
raise "#{hit.target} not in reference"
|
65
|
+
end
|
66
|
+
@reference[hit.target].hits << hit
|
74
67
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
68
|
+
unless @assembly.assembly.key? hit.query
|
69
|
+
raise "#{hit.query} not in assembly"
|
70
|
+
end
|
71
|
+
contig = @assembly[hit.query]
|
72
|
+
contig.has_crb = true
|
73
|
+
# how much of the reference is covered by this single contig
|
74
|
+
contig.reference_coverage = hit.alnlen / hit.tlen
|
75
|
+
contig.hits << hit
|
80
76
|
end
|
81
77
|
end
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
78
|
+
total_coverage = 0
|
79
|
+
total_length = 0
|
80
|
+
cov = [0.25, 0.5, 0.75, 0.85, 0.95]
|
81
|
+
@reference.each_value do |ref_contig|
|
82
|
+
key = ref_contig.name
|
83
|
+
list = ref_contig.hits
|
84
|
+
total_length += crbblast.target_is_prot ? ref_contig.length : ref_contig.length*3
|
85
|
+
|
86
|
+
next if list.empty? # skip references with no hits; this case was what was breaking everything
|
86
87
|
blocks = []
|
87
88
|
target_length = 0
|
88
89
|
list.each do |hit|
|
89
90
|
target_length = hit.tlen
|
90
91
|
if crbblast.target_is_prot
|
91
92
|
target_length *= 3
|
92
|
-
start, stop = [hit.tstart
|
93
|
+
start, stop = [hit.tstart, hit.tend].minmax
|
94
|
+
start = start*3-2
|
95
|
+
stop = stop*3
|
93
96
|
else
|
94
97
|
start, stop = [hit.tstart, hit.tend].minmax
|
95
98
|
end
|
@@ -112,7 +115,8 @@ module Transrate
|
|
112
115
|
block[0] = start
|
113
116
|
block[1] = stop
|
114
117
|
found=true
|
115
|
-
|
118
|
+
elsif o == 4 # full overlap
|
119
|
+
found=true
|
116
120
|
# nothing
|
117
121
|
# elsif o == 5 || o == 6 # no overlap
|
118
122
|
|
@@ -157,28 +161,53 @@ module Transrate
|
|
157
161
|
end # each_with_index b
|
158
162
|
end # each_with_index a
|
159
163
|
# sum blocks to find total coverage
|
160
|
-
length_of_coverage=
|
161
|
-
blocks.each do |block|
|
162
|
-
if block[0] and block[1]
|
163
|
-
if block[0]>=0 and block[1]>=0
|
164
|
-
length_of_coverage += block[1] - block[0] + 1
|
165
|
-
end
|
166
|
-
else
|
167
|
-
puts "error: key = #{key}, #{blocks}"
|
168
|
-
end
|
169
|
-
end
|
170
|
-
cov = [0.25, 0.5, 0.75, 0.85, 0.95]
|
164
|
+
length_of_coverage = calculate_coverage blocks
|
171
165
|
@cov ||= [0, 0, 0, 0, 0]
|
172
|
-
|
166
|
+
if target_length > 0
|
167
|
+
# puts "#{length_of_coverage} / #{target_length.to_f}"
|
168
|
+
ref_p = length_of_coverage / target_length.to_f
|
169
|
+
else
|
170
|
+
ref_p = 0
|
171
|
+
end
|
172
|
+
ref_contig.reference_coverage = ref_p
|
173
|
+
|
173
174
|
cov.each_with_index do |c, i|
|
174
|
-
if
|
175
|
+
if ref_p >= c
|
175
176
|
@cov[i] +=1
|
176
177
|
end
|
177
178
|
end
|
179
|
+
|
178
180
|
total_coverage += length_of_coverage
|
179
|
-
total_length += target_length
|
180
181
|
end
|
181
|
-
|
182
|
+
cov.each_with_index do |p, i|
|
183
|
+
@comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
|
184
|
+
@comp_stats["p_cov#{(100*p).to_i}".to_sym] =
|
185
|
+
@cov[i]/@reference.size.to_f
|
186
|
+
end
|
187
|
+
total_coverage / total_length.to_f
|
188
|
+
end
|
189
|
+
|
190
|
+
# Calculate the total coverage from a set of coverage blocks
|
191
|
+
def calculate_coverage blocks
|
192
|
+
coverage = 0
|
193
|
+
blocks.each do |block|
|
194
|
+
if block[0] and block[1]
|
195
|
+
if block[0]>=0 and block[1]>=0
|
196
|
+
coverage += block[1] - block[0] + 1
|
197
|
+
end
|
198
|
+
else
|
199
|
+
puts "error: key = #{key}, #{blocks}"
|
200
|
+
end
|
201
|
+
end
|
202
|
+
coverage
|
203
|
+
end
|
204
|
+
|
205
|
+
# Count reference proteins with at least one reciprocal hit
|
206
|
+
def count_ref_crbbs
|
207
|
+
@n_refs_with_recip = @reference.assembly.inject(0) do |sum, entry|
|
208
|
+
name, contig = entry
|
209
|
+
sum + (contig.hits.length > 0 ? 1 : 0)
|
210
|
+
end
|
182
211
|
end
|
183
212
|
|
184
213
|
def chimeras crbblast
|
@@ -210,6 +239,10 @@ module Transrate
|
|
210
239
|
end
|
211
240
|
if p/list.size.to_f >= 0.5
|
212
241
|
@n_chimeras += 1
|
242
|
+
unless @assembly.assembly.key? key
|
243
|
+
puts "key not in assembly: #{key}"
|
244
|
+
end
|
245
|
+
@assembly[key].is_chimera = true
|
213
246
|
end
|
214
247
|
end
|
215
248
|
@p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
|
@@ -267,20 +300,17 @@ module Transrate
|
|
267
300
|
end
|
268
301
|
end
|
269
302
|
|
270
|
-
|
303
|
+
# Count unique reference proteins per contig
|
304
|
+
def collapse_factor reciprocals
|
271
305
|
return @collapse_factor unless @collapse_factor.nil?
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
end
|
279
|
-
targets[target] << query
|
280
|
-
end
|
306
|
+
cf_sum = 0
|
307
|
+
reciprocals.each do |query, hits|
|
308
|
+
uniq_hits = Set.new hits.map{ |h| h.target }
|
309
|
+
cf = uniq_hits.length
|
310
|
+
@assembly[query].collapse_factor = cf
|
311
|
+
cf_sum += cf
|
281
312
|
end
|
282
|
-
|
283
|
-
sum / targets.size
|
313
|
+
cf_sum / reciprocals.size
|
284
314
|
end
|
285
315
|
|
286
316
|
end # ComparativeMetrics
|
data/lib/transrate/contig.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'forwardable'
|
2
|
-
require 'inline'
|
3
2
|
|
4
3
|
module Transrate
|
5
4
|
|
@@ -9,51 +8,105 @@ module Transrate
|
|
9
8
|
include Enumerable
|
10
9
|
extend Forwardable
|
11
10
|
def_delegators :@seq, :size, :length
|
12
|
-
attr_accessor :seq, :name
|
11
|
+
attr_accessor :seq, :name
|
12
|
+
# read-based metrics
|
13
|
+
attr_accessor :coverage, :uncovered_bases, :mean_coverage, :in_bridges
|
14
|
+
# reference-based metrics
|
15
|
+
attr_accessor :has_crb, :is_chimera, :collapse_factor, :reference_coverage
|
16
|
+
attr_accessor :hits
|
13
17
|
|
14
18
|
def initialize(seq, name: nil)
|
19
|
+
seq.seq.gsub!("\0", "") # there is probably a better fix than this
|
15
20
|
@seq = seq
|
21
|
+
@seq.data = nil # no need to store raw fasta string
|
16
22
|
@name = seq.respond_to?(:entry_id) ? seq.entry_id : name
|
23
|
+
@hits = []
|
24
|
+
@reference_coverage = 0
|
25
|
+
@collapse_factor = 0
|
26
|
+
@is_chimera = false
|
27
|
+
@has_crb = false
|
28
|
+
@in_bridges = 0
|
29
|
+
@mean_coverage = 0
|
17
30
|
end
|
18
31
|
|
19
32
|
def each &block
|
20
33
|
@seq.seq.each_char &block
|
21
34
|
end
|
22
35
|
|
36
|
+
# Get all metrics available for this contig
|
37
|
+
def basic_metrics
|
38
|
+
basic = {
|
39
|
+
:length => length,
|
40
|
+
:prop_gc => prop_gc,
|
41
|
+
:gc_skew => gc_skew,
|
42
|
+
:at_skew => at_skew,
|
43
|
+
:cpg_count => cpg_count,
|
44
|
+
:cpg_ratio => cpg_ratio,
|
45
|
+
:orf_length => orf_length,
|
46
|
+
:linguistic_complexity_6 => linguistic_complexity(6)
|
47
|
+
}
|
48
|
+
end
|
49
|
+
|
50
|
+
def read_metrics
|
51
|
+
read = @coverage ? {
|
52
|
+
:uncovered_bases => uncovered_bases,
|
53
|
+
:mean_coverage => mean_coverage,
|
54
|
+
:in_bridges => in_bridges
|
55
|
+
} : {
|
56
|
+
:uncovered_bases => "NA",
|
57
|
+
:mean_coverage => "NA",
|
58
|
+
:in_bridges => in_bridges
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
62
|
+
def comparative_metrics
|
63
|
+
reference = @has_crb ? {
|
64
|
+
:has_crb => has_crb,
|
65
|
+
:collapse_factor => collapse_factor,
|
66
|
+
:reference_coverage => reference_coverage,
|
67
|
+
:is_chimera => is_chimera,
|
68
|
+
:hits => hits.map{ |h| h.target }.join(";")
|
69
|
+
} : {
|
70
|
+
:has_crb => false,
|
71
|
+
:collapse_factor => "NA",
|
72
|
+
:reference_coverage => "NA",
|
73
|
+
:is_chimera => "NA",
|
74
|
+
:hits => "NA"
|
75
|
+
}
|
76
|
+
end
|
77
|
+
|
23
78
|
# Base composition of the contig
|
79
|
+
#
|
80
|
+
# If called and the instance variable @base_composition is nil
|
81
|
+
# then call the C method to count the bases and dibases in the sequence
|
82
|
+
# then get the info out of the C array and store it in the hash
|
83
|
+
# then if it is called again just return the hash as before
|
24
84
|
def base_composition
|
25
85
|
if @base_composition
|
26
86
|
return @base_composition
|
27
87
|
end
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
key = base.downcase.to_sym
|
42
|
-
base_comp[key] += 1
|
43
|
-
if last_base
|
44
|
-
# pairs of bases
|
45
|
-
dikey = "#{last_base}#{base}".downcase.to_sym
|
46
|
-
if dibase_comp[dikey]
|
47
|
-
dibase_comp[dikey] += 1
|
48
|
-
else
|
49
|
-
dibase_comp[dikey] = 1
|
50
|
-
end
|
88
|
+
# else run the C method
|
89
|
+
composition(@seq.seq)
|
90
|
+
alphabet = ['a', 'c', 'g', 't', 'n']
|
91
|
+
@base_composition = {}
|
92
|
+
@dibase_composition={}
|
93
|
+
bases = []
|
94
|
+
dibases = []
|
95
|
+
alphabet.each do |c|
|
96
|
+
bases << "#{c}".to_sym
|
97
|
+
end
|
98
|
+
alphabet.each do |c|
|
99
|
+
alphabet.each do |d|
|
100
|
+
dibases << "#{c}#{d}".to_sym
|
51
101
|
end
|
52
|
-
last_base = base
|
53
102
|
end
|
54
|
-
|
55
|
-
|
56
|
-
|
103
|
+
bases.each_with_index do |a,i|
|
104
|
+
@base_composition[a] = base_count(i)
|
105
|
+
end
|
106
|
+
dibases.each_with_index do |a,i|
|
107
|
+
@dibase_composition[a] = dibase_count(i)
|
108
|
+
end
|
109
|
+
return @base_composition
|
57
110
|
end
|
58
111
|
|
59
112
|
# Dibase composition of the contig
|
@@ -124,89 +177,37 @@ module Transrate
|
|
124
177
|
|
125
178
|
# GC skew
|
126
179
|
def gc_skew
|
127
|
-
|
180
|
+
(bases_g - bases_c) / (bases_g + bases_c).to_f
|
128
181
|
end
|
129
182
|
|
130
183
|
# AT skew
|
131
184
|
def at_skew
|
132
|
-
|
185
|
+
(bases_a - bases_t) / (bases_a + bases_t).to_f
|
133
186
|
end
|
134
187
|
|
135
188
|
# CpG count
|
136
189
|
def cpg_count
|
137
|
-
dibase_composition[:cg]
|
190
|
+
dibase_composition[:cg] + dibase_composition[:gc]
|
138
191
|
end
|
139
192
|
|
140
|
-
# CpG (C-phosphate-G) ratio
|
193
|
+
# observed-to-expected CpG (C-phosphate-G) ratio
|
141
194
|
def cpg_ratio
|
142
|
-
dibase_composition[:cg]
|
195
|
+
r = dibase_composition[:cg] + dibase_composition[:gc]
|
196
|
+
r /= (bases_c * bases_g).to_f
|
197
|
+
r *= (length - bases_n)
|
198
|
+
return r
|
143
199
|
end
|
144
200
|
|
145
201
|
# Find the longest orf in the contig
|
146
202
|
def orf_length
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
# Inlined C longest-ORF function
|
152
|
-
inline do |builder|
|
153
|
-
builder.c <<SRC
|
154
|
-
static
|
155
|
-
void
|
156
|
-
longest_orf(VALUE _s) {
|
157
|
-
int i,sl,longest=0;
|
158
|
-
int len[6];
|
159
|
-
char * c_str;
|
160
|
-
|
161
|
-
sl = RSTRING_LEN(_s);
|
162
|
-
c_str = StringValueCStr(_s);
|
163
|
-
for (i=0;i<6;i++) {
|
164
|
-
len[i]=0;
|
165
|
-
}
|
166
|
-
for (i=0;i<sl-2;i++) {
|
167
|
-
if (c_str[i]=='T' &&
|
168
|
-
((c_str[i+1]=='A' && c_str[i+2]=='G') ||
|
169
|
-
(c_str[i+1]=='A' && c_str[i+2]=='A') ||
|
170
|
-
(c_str[i+1]=='G' && c_str[i+2]=='A'))) {
|
171
|
-
if (len[i%3] > longest) {
|
172
|
-
longest = len[i%3];
|
173
|
-
}
|
174
|
-
len[i%3]=0;
|
175
|
-
} else {
|
176
|
-
len[i%3]++;
|
177
|
-
}
|
178
|
-
if (c_str[i+2]=='A' &&
|
179
|
-
((c_str[i]=='C' && c_str[i+1]=='T') ||
|
180
|
-
(c_str[i]=='T' && c_str[i+1]=='T') ||
|
181
|
-
(c_str[i]=='T' && c_str[i+1]=='C'))) {
|
182
|
-
if (len[3+i%3] > longest) {
|
183
|
-
longest = len[3+i%3];
|
184
|
-
}
|
185
|
-
len[3+i%3]=0;
|
186
|
-
} else {
|
187
|
-
len[3+i%3]++;
|
188
|
-
}
|
189
|
-
}
|
190
|
-
if (len[i%3] > longest) {
|
191
|
-
longest = len[i%3];
|
192
|
-
}
|
193
|
-
if (len[3+i%3] > longest) {
|
194
|
-
longest = len[3+i%3];
|
195
|
-
}
|
196
|
-
return INT2NUM(longest);
|
197
|
-
}
|
198
|
-
SRC
|
203
|
+
return @orf_length if @orf_length
|
204
|
+
@orf_length = longest_orf(@seq.seq) # call to C
|
205
|
+
return @orf_length
|
199
206
|
end
|
200
207
|
|
201
208
|
def linguistic_complexity k
|
202
|
-
|
203
|
-
set = Set.new
|
204
|
-
(0..@seq.length-k).each do |i|
|
205
|
-
set << @seq.seq.slice(i,k).upcase # slice(start, length)
|
206
|
-
end # count how many kmers in seq
|
207
|
-
set.size / d.to_f
|
209
|
+
return kmer_count(k, @seq.seq)/(4**k).to_f # call to C
|
208
210
|
end
|
209
|
-
|
210
211
|
end
|
211
212
|
|
212
213
|
end
|