transrate 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +1 -0
- data/LICENSE +2 -15
- data/README.md +14 -132
- data/Rakefile +19 -2
- data/bin/transrate +49 -10
- data/deps/deps.yaml +0 -10
- data/docs/transrate_logo_full.png +0 -0
- data/ext/transrate/extconf.rb +13 -0
- data/ext/transrate/transrate.c +223 -0
- data/lib/transrate.rb +1 -0
- data/lib/transrate/assembly.rb +12 -10
- data/lib/transrate/bowtie2.rb +7 -0
- data/lib/transrate/comparative_metrics.rb +103 -73
- data/lib/transrate/contig.rb +94 -93
- data/lib/transrate/contig_metrics.rb +1 -2
- data/lib/transrate/read_metrics.rb +13 -7
- data/lib/transrate/version.rb +1 -1
- data/test/helper.rb +1 -31
- data/test/test_bin.rb +99 -0
- data/test/test_bowtie.rb +12 -0
- data/test/test_comp_metrics.rb +161 -104
- data/test/test_contig.rb +62 -6
- data/test/test_contig_metrics.rb +2 -2
- data/test/test_inline.rb +2 -2
- data/test/test_transrater.rb +1 -1
- data/transrate.gemspec +5 -4
- metadata +40 -22
data/lib/transrate.rb
CHANGED
data/lib/transrate/assembly.rb
CHANGED
@@ -26,7 +26,7 @@ module Transrate
|
|
26
26
|
|
27
27
|
include Enumerable
|
28
28
|
extend Forwardable
|
29
|
-
def_delegators :@assembly, :each, :<<, :size, :length
|
29
|
+
def_delegators :@assembly, :each, :each_value, :<<, :size, :length, :[]
|
30
30
|
|
31
31
|
attr_accessor :file
|
32
32
|
attr_reader :assembly
|
@@ -43,11 +43,12 @@ module Transrate
|
|
43
43
|
unless File.exist? @file
|
44
44
|
raise IOError.new "Assembly file doesn't exist: #{@file}"
|
45
45
|
end
|
46
|
-
@assembly =
|
46
|
+
@assembly = {}
|
47
47
|
@n_bases = 0
|
48
48
|
Bio::FastaFormat.open(file).each do |entry|
|
49
49
|
@n_bases += entry.length
|
50
|
-
|
50
|
+
contig = Contig.new(entry)
|
51
|
+
@assembly[contig.name] = contig
|
51
52
|
end
|
52
53
|
@contig_metrics = ContigMetrics.new self
|
53
54
|
end
|
@@ -78,7 +79,7 @@ module Transrate
|
|
78
79
|
# @return [Hash] basic statistics about the assembly
|
79
80
|
def basic_stats threads=1
|
80
81
|
return @basic_stats if @basic_stats
|
81
|
-
bin = @assembly.
|
82
|
+
bin = @assembly.values
|
82
83
|
@basic_stats = basic_bin_stats bin
|
83
84
|
@basic_stats
|
84
85
|
end # basic_stats
|
@@ -103,7 +104,7 @@ module Transrate
|
|
103
104
|
# representing contigs in the assembly
|
104
105
|
|
105
106
|
def basic_bin_stats bin
|
106
|
-
|
107
|
+
|
107
108
|
# cumulative length is a float so we can divide it
|
108
109
|
# accurately later to get the mean length
|
109
110
|
cumulative_length = 0.0
|
@@ -194,7 +195,7 @@ module Transrate
|
|
194
195
|
covfile = Samtools.coverage bam
|
195
196
|
# get an assembly enumerator
|
196
197
|
assembly_enum = @assembly.to_enum
|
197
|
-
contig = assembly_enum.next
|
198
|
+
contig_name, contig = assembly_enum.next
|
198
199
|
# precreate an array of the correct size to contain
|
199
200
|
# coverage. this is necessary because samtools mpileup
|
200
201
|
# doesn't print a result line for bases with 0 coverage
|
@@ -209,12 +210,13 @@ module Transrate
|
|
209
210
|
break
|
210
211
|
end
|
211
212
|
# extract the columns
|
212
|
-
name
|
213
|
-
|
214
|
-
|
213
|
+
name = Bio::FastaDefline.new(cols[name_i]).entry_id
|
214
|
+
pos, cov = cols[pos_i].to_i, cols[cov_i].to_i
|
215
|
+
unless contig_name == name
|
216
|
+
while contig_name != name
|
215
217
|
begin
|
216
218
|
block.call(contig, contig.coverage)
|
217
|
-
contig = assembly_enum.next
|
219
|
+
contig_name, contig = assembly_enum.next
|
218
220
|
contig.coverage = Array.new(contig.length, 0)
|
219
221
|
rescue StopIteration => stop_error
|
220
222
|
logger.error 'reached the end of assembly enumerator while ' +
|
data/lib/transrate/bowtie2.rb
CHANGED
@@ -49,6 +49,9 @@ module Transrate
|
|
49
49
|
# run bowtie
|
50
50
|
runner = Cmd.new bowtiecmd
|
51
51
|
runner.run
|
52
|
+
if !runner.status.success?
|
53
|
+
raise Bowtie2Error.new("Bowtie2 failed\n#{runner.stderr}")
|
54
|
+
end
|
52
55
|
end
|
53
56
|
@sam
|
54
57
|
end
|
@@ -59,6 +62,10 @@ module Transrate
|
|
59
62
|
cmd = "#{@bowtie2_build} --quiet --offrate 1 #{file} #{@index_name}"
|
60
63
|
runner = Cmd.new cmd
|
61
64
|
runner.run
|
65
|
+
if !runner.status.success?
|
66
|
+
msg = "Failed to build Bowtie2 index\n#{runner.stderr}"
|
67
|
+
raise Bowtie2Error.new(msg)
|
68
|
+
end
|
62
69
|
end
|
63
70
|
@index_built = true
|
64
71
|
end
|
@@ -10,86 +10,89 @@ module Transrate
|
|
10
10
|
attr_reader :reciprocal_hits
|
11
11
|
attr_reader :has_run
|
12
12
|
attr_reader :reference_coverage
|
13
|
+
attr_reader :comp_stats
|
13
14
|
attr_reader :n_chimeras, :p_chimeras
|
14
15
|
|
15
16
|
def initialize assembly, reference, threads
|
16
17
|
@assembly = assembly
|
17
18
|
@reference = reference
|
18
19
|
@threads = threads
|
20
|
+
@comp_stats = Hash.new
|
19
21
|
end
|
20
22
|
|
21
23
|
def run
|
22
24
|
@crbblast = reciprocal_best_blast
|
23
|
-
@
|
24
|
-
@collapse_factor = collapse_factor @crbblast.
|
25
|
+
@reference_coverage = coverage @crbblast
|
26
|
+
@collapse_factor = collapse_factor @crbblast.reciprocals
|
25
27
|
@reciprocal_hits = @crbblast.size
|
26
28
|
@rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
|
27
|
-
@reference_coverage = @ortholog_hit_ratio * @rbh_per_reference
|
28
|
-
@rbh_per_contig = @reciprocal_hits.to_f / @assembly.assembly.size.to_f
|
29
29
|
@p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
|
30
30
|
@n_contigs_with_recip = @crbblast.reciprocals.size
|
31
|
+
count_ref_crbbs
|
31
32
|
@p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
|
32
33
|
chimeras @crbblast
|
34
|
+
self.run_comp_stats
|
33
35
|
@has_run = true
|
34
36
|
end
|
35
37
|
|
36
|
-
def
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
:collapse_factor => @collapse_factor,
|
48
|
-
:n_chimeras => @n_chimeras,
|
49
|
-
:p_chimeras => @p_chimeras,
|
50
|
-
:cov25 => @cov[0],
|
51
|
-
:cov50 => @cov[1],
|
52
|
-
:cov75 => @cov[2],
|
53
|
-
:cov85 => @cov[3],
|
54
|
-
:cov95 => @cov[4],
|
55
|
-
:p_cov25 => @cov[0]/@reference.size.to_f,
|
56
|
-
:p_cov50 => @cov[1]/@reference.size.to_f,
|
57
|
-
:p_cov75 => @cov[2]/@reference.size.to_f,
|
58
|
-
:p_cov85 => @cov[3]/@reference.size.to_f,
|
59
|
-
:p_cov95 => @cov[4]/@reference.size.to_f
|
60
|
-
}
|
38
|
+
def run_comp_stats
|
39
|
+
@comp_stats[:CRBB_hits] = @reciprocal_hits # CRBB hits
|
40
|
+
@comp_stats[:p_contigs_with_CRBB] = @p_contigs_with_recip
|
41
|
+
@comp_stats[:n_contigs_with_CRBB] = @n_contigs_with_recip
|
42
|
+
@comp_stats[:p_refs_with_CRBB] = @p_refs_with_recip
|
43
|
+
@comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
|
44
|
+
@comp_stats[:rbh_per_reference] = @rbh_per_reference
|
45
|
+
@comp_stats[:reference_coverage] = @reference_coverage
|
46
|
+
@comp_stats[:collapse_factor] = @collapse_factor
|
47
|
+
@comp_stats[:n_chimeras] = @n_chimeras
|
48
|
+
@comp_stats[:p_chimeras] = @p_chimeras
|
61
49
|
end
|
62
50
|
|
63
51
|
def reciprocal_best_blast
|
64
|
-
crbblast = CRB_Blast.new @assembly.file, @reference.file
|
65
|
-
crbblast.run
|
52
|
+
crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
|
53
|
+
crbblast.run(1e-5, @threads, true)
|
66
54
|
crbblast
|
67
55
|
end
|
68
56
|
|
69
57
|
# coverage of contigs that have reciprocal hits
|
70
|
-
# divided by
|
71
|
-
|
72
|
-
|
73
|
-
|
58
|
+
# divided by number of reciprocal targets
|
59
|
+
def coverage crbblast
|
60
|
+
return @reference_coverage unless @reference_coverage.nil?
|
61
|
+
crbblast.reciprocals.each do |key, list|
|
62
|
+
list.each_with_index do |hit, i|
|
63
|
+
unless @reference.assembly.key? hit.target
|
64
|
+
raise "#{hit.target} not in reference"
|
65
|
+
end
|
66
|
+
@reference[hit.target].hits << hit
|
74
67
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
68
|
+
unless @assembly.assembly.key? hit.query
|
69
|
+
raise "#{hit.query} not in assembly"
|
70
|
+
end
|
71
|
+
contig = @assembly[hit.query]
|
72
|
+
contig.has_crb = true
|
73
|
+
# how much of the reference is covered by this single contig
|
74
|
+
contig.reference_coverage = hit.alnlen / hit.tlen
|
75
|
+
contig.hits << hit
|
80
76
|
end
|
81
77
|
end
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
78
|
+
total_coverage = 0
|
79
|
+
total_length = 0
|
80
|
+
cov = [0.25, 0.5, 0.75, 0.85, 0.95]
|
81
|
+
@reference.each_value do |ref_contig|
|
82
|
+
key = ref_contig.name
|
83
|
+
list = ref_contig.hits
|
84
|
+
total_length += crbblast.target_is_prot ? ref_contig.length : ref_contig.length*3
|
85
|
+
|
86
|
+
next if list.empty? # ah this is what was breaking everything
|
86
87
|
blocks = []
|
87
88
|
target_length = 0
|
88
89
|
list.each do |hit|
|
89
90
|
target_length = hit.tlen
|
90
91
|
if crbblast.target_is_prot
|
91
92
|
target_length *= 3
|
92
|
-
start, stop = [hit.tstart
|
93
|
+
start, stop = [hit.tstart, hit.tend].minmax
|
94
|
+
start = start*3-2
|
95
|
+
stop = stop*3
|
93
96
|
else
|
94
97
|
start, stop = [hit.tstart, hit.tend].minmax
|
95
98
|
end
|
@@ -112,7 +115,8 @@ module Transrate
|
|
112
115
|
block[0] = start
|
113
116
|
block[1] = stop
|
114
117
|
found=true
|
115
|
-
|
118
|
+
elsif o == 4 # full overlap
|
119
|
+
found=true
|
116
120
|
# nothing
|
117
121
|
# elsif o == 5 || o == 6 # no overlap
|
118
122
|
|
@@ -157,28 +161,53 @@ module Transrate
|
|
157
161
|
end # each_with_index b
|
158
162
|
end # each_with_index a
|
159
163
|
# sum blocks to find total coverage
|
160
|
-
length_of_coverage=
|
161
|
-
blocks.each do |block|
|
162
|
-
if block[0] and block[1]
|
163
|
-
if block[0]>=0 and block[1]>=0
|
164
|
-
length_of_coverage += block[1] - block[0] + 1
|
165
|
-
end
|
166
|
-
else
|
167
|
-
puts "error: key = #{key}, #{blocks}"
|
168
|
-
end
|
169
|
-
end
|
170
|
-
cov = [0.25, 0.5, 0.75, 0.85, 0.95]
|
164
|
+
length_of_coverage = calculate_coverage blocks
|
171
165
|
@cov ||= [0, 0, 0, 0, 0]
|
172
|
-
|
166
|
+
if target_length > 0
|
167
|
+
# puts "#{length_of_coverage} / #{target_length.to_f}"
|
168
|
+
ref_p = length_of_coverage / target_length.to_f
|
169
|
+
else
|
170
|
+
ref_p = 0
|
171
|
+
end
|
172
|
+
ref_contig.reference_coverage = ref_p
|
173
|
+
|
173
174
|
cov.each_with_index do |c, i|
|
174
|
-
if
|
175
|
+
if ref_p >= c
|
175
176
|
@cov[i] +=1
|
176
177
|
end
|
177
178
|
end
|
179
|
+
|
178
180
|
total_coverage += length_of_coverage
|
179
|
-
total_length += target_length
|
180
181
|
end
|
181
|
-
|
182
|
+
cov.each_with_index do |p, i|
|
183
|
+
@comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
|
184
|
+
@comp_stats["p_cov#{(100*p).to_i}".to_sym] =
|
185
|
+
@cov[i]/@reference.size.to_f
|
186
|
+
end
|
187
|
+
total_coverage / total_length.to_f
|
188
|
+
end
|
189
|
+
|
190
|
+
# Calculate the total coverage from a set of coverage blocks
|
191
|
+
def calculate_coverage blocks
|
192
|
+
coverage = 0
|
193
|
+
blocks.each do |block|
|
194
|
+
if block[0] and block[1]
|
195
|
+
if block[0]>=0 and block[1]>=0
|
196
|
+
coverage += block[1] - block[0] + 1
|
197
|
+
end
|
198
|
+
else
|
199
|
+
puts "error: key = #{key}, #{blocks}"
|
200
|
+
end
|
201
|
+
end
|
202
|
+
coverage
|
203
|
+
end
|
204
|
+
|
205
|
+
# Count reference proteins with at least one reciprocal hit
|
206
|
+
def count_ref_crbbs
|
207
|
+
@n_refs_with_recip = @reference.assembly.inject(0) do |sum, entry|
|
208
|
+
name, contig = entry
|
209
|
+
sum + (contig.hits.length > 0 ? 1 : 0)
|
210
|
+
end
|
182
211
|
end
|
183
212
|
|
184
213
|
def chimeras crbblast
|
@@ -210,6 +239,10 @@ module Transrate
|
|
210
239
|
end
|
211
240
|
if p/list.size.to_f >= 0.5
|
212
241
|
@n_chimeras += 1
|
242
|
+
unless @assembly.assembly.key? key
|
243
|
+
puts "key not in assembly: #{key}"
|
244
|
+
end
|
245
|
+
@assembly[key].is_chimera = true
|
213
246
|
end
|
214
247
|
end
|
215
248
|
@p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
|
@@ -267,20 +300,17 @@ module Transrate
|
|
267
300
|
end
|
268
301
|
end
|
269
302
|
|
270
|
-
|
303
|
+
# Count unique reference proteins per contig
|
304
|
+
def collapse_factor reciprocals
|
271
305
|
return @collapse_factor unless @collapse_factor.nil?
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
end
|
279
|
-
targets[target] << query
|
280
|
-
end
|
306
|
+
cf_sum = 0
|
307
|
+
reciprocals.each do |query, hits|
|
308
|
+
uniq_hits = Set.new hits.map{ |h| h.target }
|
309
|
+
cf = uniq_hits.length
|
310
|
+
@assembly[query].collapse_factor = cf
|
311
|
+
cf_sum += cf
|
281
312
|
end
|
282
|
-
|
283
|
-
sum / targets.size
|
313
|
+
cf_sum / reciprocals.size
|
284
314
|
end
|
285
315
|
|
286
316
|
end # ComparativeMetrics
|
data/lib/transrate/contig.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'forwardable'
|
2
|
-
require 'inline'
|
3
2
|
|
4
3
|
module Transrate
|
5
4
|
|
@@ -9,51 +8,105 @@ module Transrate
|
|
9
8
|
include Enumerable
|
10
9
|
extend Forwardable
|
11
10
|
def_delegators :@seq, :size, :length
|
12
|
-
attr_accessor :seq, :name
|
11
|
+
attr_accessor :seq, :name
|
12
|
+
# read-based metrics
|
13
|
+
attr_accessor :coverage, :uncovered_bases, :mean_coverage, :in_bridges
|
14
|
+
# reference-based metrics
|
15
|
+
attr_accessor :has_crb, :is_chimera, :collapse_factor, :reference_coverage
|
16
|
+
attr_accessor :hits
|
13
17
|
|
14
18
|
def initialize(seq, name: nil)
|
19
|
+
seq.seq.gsub!("\0", "") # there is probably a better fix than this
|
15
20
|
@seq = seq
|
21
|
+
@seq.data = nil # no need to store raw fasta string
|
16
22
|
@name = seq.respond_to?(:entry_id) ? seq.entry_id : name
|
23
|
+
@hits = []
|
24
|
+
@reference_coverage = 0
|
25
|
+
@collapse_factor = 0
|
26
|
+
@is_chimera = false
|
27
|
+
@has_crb = false
|
28
|
+
@in_bridges = 0
|
29
|
+
@mean_coverage = 0
|
17
30
|
end
|
18
31
|
|
19
32
|
def each &block
|
20
33
|
@seq.seq.each_char &block
|
21
34
|
end
|
22
35
|
|
36
|
+
# Get all metrics available for this contig
|
37
|
+
def basic_metrics
|
38
|
+
basic = {
|
39
|
+
:length => length,
|
40
|
+
:prop_gc => prop_gc,
|
41
|
+
:gc_skew => gc_skew,
|
42
|
+
:at_skew => at_skew,
|
43
|
+
:cpg_count => cpg_count,
|
44
|
+
:cpg_ratio => cpg_ratio,
|
45
|
+
:orf_length => orf_length,
|
46
|
+
:linguistic_complexity_6 => linguistic_complexity(6)
|
47
|
+
}
|
48
|
+
end
|
49
|
+
|
50
|
+
def read_metrics
|
51
|
+
read = @coverage ? {
|
52
|
+
:uncovered_bases => uncovered_bases,
|
53
|
+
:mean_coverage => mean_coverage,
|
54
|
+
:in_bridges => in_bridges
|
55
|
+
} : {
|
56
|
+
:uncovered_bases => "NA",
|
57
|
+
:mean_coverage => "NA",
|
58
|
+
:in_bridges => in_bridges
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
62
|
+
def comparative_metrics
|
63
|
+
reference = @has_crb ? {
|
64
|
+
:has_crb => has_crb,
|
65
|
+
:collapse_factor => collapse_factor,
|
66
|
+
:reference_coverage => reference_coverage,
|
67
|
+
:is_chimera => is_chimera,
|
68
|
+
:hits => hits.map{ |h| h.target }.join(";")
|
69
|
+
} : {
|
70
|
+
:has_crb => false,
|
71
|
+
:collapse_factor => "NA",
|
72
|
+
:reference_coverage => "NA",
|
73
|
+
:is_chimera => "NA",
|
74
|
+
:hits => "NA"
|
75
|
+
}
|
76
|
+
end
|
77
|
+
|
23
78
|
# Base composition of the contig
|
79
|
+
#
|
80
|
+
# If called and the instance variable @base_composition is nil
|
81
|
+
# then call the c method to count the bases and dibases in the sequence
|
82
|
+
# then get the info out of the c array and store it in the hash
|
83
|
+
# then if it is called again just return the hash as before
|
24
84
|
def base_composition
|
25
85
|
if @base_composition
|
26
86
|
return @base_composition
|
27
87
|
end
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
key = base.downcase.to_sym
|
42
|
-
base_comp[key] += 1
|
43
|
-
if last_base
|
44
|
-
# pairs of bases
|
45
|
-
dikey = "#{last_base}#{base}".downcase.to_sym
|
46
|
-
if dibase_comp[dikey]
|
47
|
-
dibase_comp[dikey] += 1
|
48
|
-
else
|
49
|
-
dibase_comp[dikey] = 1
|
50
|
-
end
|
88
|
+
# else run the C method
|
89
|
+
composition(@seq.seq)
|
90
|
+
alphabet = ['a', 'c', 'g', 't', 'n']
|
91
|
+
@base_composition = {}
|
92
|
+
@dibase_composition={}
|
93
|
+
bases = []
|
94
|
+
dibases = []
|
95
|
+
alphabet.each do |c|
|
96
|
+
bases << "#{c}".to_sym
|
97
|
+
end
|
98
|
+
alphabet.each do |c|
|
99
|
+
alphabet.each do |d|
|
100
|
+
dibases << "#{c}#{d}".to_sym
|
51
101
|
end
|
52
|
-
last_base = base
|
53
102
|
end
|
54
|
-
|
55
|
-
|
56
|
-
|
103
|
+
bases.each_with_index do |a,i|
|
104
|
+
@base_composition[a] = base_count(i)
|
105
|
+
end
|
106
|
+
dibases.each_with_index do |a,i|
|
107
|
+
@dibase_composition[a] = dibase_count(i)
|
108
|
+
end
|
109
|
+
return @base_composition
|
57
110
|
end
|
58
111
|
|
59
112
|
# Dibase composition of the contig
|
@@ -124,89 +177,37 @@ module Transrate
|
|
124
177
|
|
125
178
|
# GC skew
|
126
179
|
def gc_skew
|
127
|
-
|
180
|
+
(bases_g - bases_c) / (bases_g + bases_c).to_f
|
128
181
|
end
|
129
182
|
|
130
183
|
# AT skew
|
131
184
|
def at_skew
|
132
|
-
|
185
|
+
(bases_a - bases_t) / (bases_a + bases_t).to_f
|
133
186
|
end
|
134
187
|
|
135
188
|
# CpG count
|
136
189
|
def cpg_count
|
137
|
-
dibase_composition[:cg]
|
190
|
+
dibase_composition[:cg] + dibase_composition[:gc]
|
138
191
|
end
|
139
192
|
|
140
|
-
# CpG (C-phosphate-G) ratio
|
193
|
+
# observed-to-expected CpG (C-phosphate-G) ratio
|
141
194
|
def cpg_ratio
|
142
|
-
dibase_composition[:cg]
|
195
|
+
r = dibase_composition[:cg] + dibase_composition[:gc]
|
196
|
+
r /= (bases_c * bases_g).to_f
|
197
|
+
r *= (length - bases_n)
|
198
|
+
return r
|
143
199
|
end
|
144
200
|
|
145
201
|
# Find the longest orf in the contig
|
146
202
|
def orf_length
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
# Inlined C longest-ORF function
|
152
|
-
inline do |builder|
|
153
|
-
builder.c <<SRC
|
154
|
-
static
|
155
|
-
void
|
156
|
-
longest_orf(VALUE _s) {
|
157
|
-
int i,sl,longest=0;
|
158
|
-
int len[6];
|
159
|
-
char * c_str;
|
160
|
-
|
161
|
-
sl = RSTRING_LEN(_s);
|
162
|
-
c_str = StringValueCStr(_s);
|
163
|
-
for (i=0;i<6;i++) {
|
164
|
-
len[i]=0;
|
165
|
-
}
|
166
|
-
for (i=0;i<sl-2;i++) {
|
167
|
-
if (c_str[i]=='T' &&
|
168
|
-
((c_str[i+1]=='A' && c_str[i+2]=='G') ||
|
169
|
-
(c_str[i+1]=='A' && c_str[i+2]=='A') ||
|
170
|
-
(c_str[i+1]=='G' && c_str[i+2]=='A'))) {
|
171
|
-
if (len[i%3] > longest) {
|
172
|
-
longest = len[i%3];
|
173
|
-
}
|
174
|
-
len[i%3]=0;
|
175
|
-
} else {
|
176
|
-
len[i%3]++;
|
177
|
-
}
|
178
|
-
if (c_str[i+2]=='A' &&
|
179
|
-
((c_str[i]=='C' && c_str[i+1]=='T') ||
|
180
|
-
(c_str[i]=='T' && c_str[i+1]=='T') ||
|
181
|
-
(c_str[i]=='T' && c_str[i+1]=='C'))) {
|
182
|
-
if (len[3+i%3] > longest) {
|
183
|
-
longest = len[3+i%3];
|
184
|
-
}
|
185
|
-
len[3+i%3]=0;
|
186
|
-
} else {
|
187
|
-
len[3+i%3]++;
|
188
|
-
}
|
189
|
-
}
|
190
|
-
if (len[i%3] > longest) {
|
191
|
-
longest = len[i%3];
|
192
|
-
}
|
193
|
-
if (len[3+i%3] > longest) {
|
194
|
-
longest = len[3+i%3];
|
195
|
-
}
|
196
|
-
return INT2NUM(longest);
|
197
|
-
}
|
198
|
-
SRC
|
203
|
+
return @orf_length if @orf_length
|
204
|
+
@orf_length = longest_orf(@seq.seq) # call to C
|
205
|
+
return @orf_length
|
199
206
|
end
|
200
207
|
|
201
208
|
def linguistic_complexity k
|
202
|
-
|
203
|
-
set = Set.new
|
204
|
-
(0..@seq.length-k).each do |i|
|
205
|
-
set << @seq.seq.slice(i,k).upcase # slice(start, length)
|
206
|
-
end # count how many kmers in seq
|
207
|
-
set.size / d.to_f
|
209
|
+
return kmer_count(k, @seq.seq)/(4**k).to_f # call to C
|
208
210
|
end
|
209
|
-
|
210
211
|
end
|
211
212
|
|
212
213
|
end
|