transrate 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +16 -1
- data/.travis.yml +8 -0
- data/README.md +45 -43
- data/Rakefile +36 -0
- data/bin/transrate +98 -50
- data/deps/deps.yaml +55 -0
- data/lib/transrate.rb +19 -4
- data/lib/transrate/assembly.rb +93 -182
- data/lib/transrate/bowtie2.rb +37 -13
- data/lib/transrate/cmd.rb +19 -0
- data/lib/transrate/comparative_metrics.rb +239 -19
- data/lib/transrate/contig.rb +212 -0
- data/lib/transrate/contig_metrics.rb +76 -0
- data/lib/transrate/read_metrics.rb +83 -41
- data/lib/transrate/samtools.rb +73 -0
- data/lib/transrate/transrater.rb +31 -11
- data/lib/transrate/version.rb +1 -1
- data/test/data/150uncovered.l.fq +892 -0
- data/test/data/150uncovered.r.fq +892 -0
- data/test/data/Os.protein.2.fa +95 -0
- data/test/data/Os.protein.fa +199 -0
- data/test/data/assembly.2.fa +26 -0
- data/test/{assembly.fasta → data/assembly.fasta} +0 -0
- data/test/data/bridging_reads.l.fastq +20 -0
- data/test/data/bridging_reads.r.fastq +20 -0
- data/test/data/sorghum_transcript.fa +4 -0
- data/test/data/tiny.sam +4 -0
- data/test/helper.rb +33 -2
- data/test/test_bowtie.rb +54 -0
- data/test/test_cmd.rb +15 -0
- data/test/test_comp_metrics.rb +177 -0
- data/test/test_contig.rb +61 -0
- data/test/test_contig_metrics.rb +50 -0
- data/test/test_inline.rb +10 -9
- data/test/test_read_metrics.rb +68 -0
- data/test/test_samtools.rb +22 -0
- data/test/test_transrate.rb +40 -0
- data/test/test_transrater.rb +68 -0
- data/transrate.gemspec +16 -10
- metadata +232 -57
- data/lib/transrate/express.rb +0 -37
- data/lib/transrate/log.rb +0 -16
- data/lib/transrate/rb_hit.rb +0 -33
- data/lib/transrate/reciprocal_annotation.rb +0 -105
- data/lib/transrate/usearch.rb +0 -66
- data/test/test_test.rb +0 -41
data/lib/transrate/bowtie2.rb
CHANGED
@@ -1,42 +1,66 @@
|
|
1
1
|
module Transrate
|
2
2
|
|
3
|
+
# Error raised by Bowtie2 when one of the bowtie2 executables cannot be
# found on the path, or when reads are mapped before an index is built.
Bowtie2Error = Class.new(StandardError)
|
5
|
+
|
3
6
|
# Thin wrapper around the bowtie2 and bowtie2-build executables. It
# builds an index from a sequence file and maps reads against that index,
# remembering the path of the SAM file it asked bowtie2 to write.
class Bowtie2

  require 'which'
  include Which

  # index_name - name of the index set by #build_index ("" until then)
  # sam        - absolute path of the SAM file set by #map_reads
  attr_reader :index_name, :sam

  # Locate both executables on the path.
  #
  # Raises Bowtie2Error if bowtie2 or bowtie2-build cannot be found.
  def initialize
    @bowtie2 = executable_path('bowtie2')
    @bowtie2_build = executable_path('bowtie2-build')
    @index_built = false
    @index_name = ""
  end

  # Map reads against the index previously built with #build_index.
  #
  # file       - not used by this method (kept for interface compatibility)
  # left       - path to the left (or only) reads file
  # right      - path to the right reads file, or nil for unpaired reads
  # insertsize - mean insert size
  # insertsd   - insert size standard deviation; bowtie2 is given
  #              -X (insertsize + 3 * insertsd)
  # outputname - not used by this method (kept for interface compatibility)
  # threads    - value passed to bowtie2 as -p
  #
  # bowtie2 is only run when the SAM file does not exist yet.
  #
  # Returns the absolute path of the SAM file.
  # Raises Bowtie2Error if no index has been built.
  def map_reads(file, left, right, insertsize: 200, insertsd: 50,
                outputname: nil, threads: 8)
    raise Bowtie2Error, "Index not built" unless @index_built
    # `right` may be nil (unpaired reads); only existing inputs go into
    # the SAM file name, followed by the index name
    name_parts = [left, right].compact.map { |reads| File.basename(reads) }
    name_parts << File.basename(@index_name)
    @sam = File.expand_path("#{name_parts.join('.')}.sam")
    realistic_dist = insertsize + (3 * insertsd)
    unless File.exist? @sam
      # construct bowtie command
      bowtiecmd = "#{@bowtie2} --very-sensitive"
      bowtiecmd += " -p #{threads} -X #{realistic_dist}"
      bowtiecmd += " --quiet --no-unal"
      bowtiecmd += " --seed 1337"
      bowtiecmd += " -x #{@index_name}"
      # paired end?
      if right
        bowtiecmd += " -1 #{left}"
        bowtiecmd += " -2 #{right}"
      else
        bowtiecmd += " -U #{left}"
      end
      bowtiecmd += " -S #{@sam}"
      # run bowtie
      runner = Cmd.new bowtiecmd
      runner.run
    end
    @sam
  end

  # Build a bowtie2 index for +file+ unless "<index_name>.1.bt2" already
  # exists. The index name is the file's basename without its last
  # extension (a name with no extension is used unchanged).
  #
  # Returns true and marks the index as built.
  def build_index file
    @index_name = File.basename(file, File.extname(file))
    unless File.exist?(@index_name + '.1.bt2')
      cmd = "#{@bowtie2_build} --quiet --offrate 1 #{file} #{@index_name}"
      runner = Cmd.new cmd
      runner.run
    end
    @index_built = true
  end

  private

  # First path reported by `which` for +name+.
  # Raises Bowtie2Error when nothing is found.
  def executable_path(name)
    paths = which(name)
    raise Bowtie2Error, "could not find #{name} in the path" if paths.empty?
    paths.first
  end

end # Bowtie2
|
@@ -1,7 +1,8 @@
|
|
1
1
|
require 'set'
|
2
|
+
require 'crb-blast'
|
2
3
|
|
3
4
|
module Transrate
|
4
|
-
|
5
|
+
|
5
6
|
class ComparativeMetrics
|
6
7
|
|
7
8
|
attr_reader :rbh_per_contig
|
@@ -9,21 +10,26 @@ module Transrate
|
|
9
10
|
attr_reader :reciprocal_hits
|
10
11
|
attr_reader :has_run
|
11
12
|
attr_reader :reference_coverage
|
13
|
+
attr_reader :n_chimeras, :p_chimeras
|
12
14
|
|
13
|
-
def initialize assembly, reference
|
15
|
+
# Remember the inputs used later by #run.
#
# assembly  - the assembly being assessed
# reference - the reference it is compared against
# threads   - thread count handed to the reciprocal best blast run
def initialize(assembly, reference, threads)
  @reference = reference
  @assembly = assembly
  @threads = threads
end
|
18
20
|
|
19
21
|
# Run the reciprocal best blast and derive every comparative metric
# from its result, storing each one in an instance variable.
#
# The order matters: #ortholog_hit_ratio also sets @n_refs_with_recip,
# which is read further down.
#
# Returns true (the value assigned to @has_run).
def run
  @crbblast = reciprocal_best_blast
  @ortholog_hit_ratio = ortholog_hit_ratio(@crbblast)
  @collapse_factor = collapse_factor(@crbblast.target_results)
  @reciprocal_hits = @crbblast.size
  hit_count = @reciprocal_hits.to_f
  @rbh_per_reference = hit_count / @reference.size.to_f
  @reference_coverage = @ortholog_hit_ratio * @rbh_per_reference
  @rbh_per_contig = hit_count / @assembly.assembly.size.to_f
  contigs_with_recip = @crbblast.reciprocals.size
  @p_contigs_with_recip = contigs_with_recip / @assembly.size.to_f
  @n_contigs_with_recip = contigs_with_recip
  @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
  chimeras(@crbblast)
  @has_run = true
end
|
29
35
|
|
@@ -31,33 +37,247 @@ module Transrate
|
|
31
37
|
{
|
32
38
|
:reciprocal_hits => @reciprocal_hits,
|
33
39
|
:rbh_per_contig => @rbh_per_contig,
|
40
|
+
:p_contigs_with_recip => @p_contigs_with_recip,
|
41
|
+
:n_contigs_with_recip => @n_contigs_with_recip,
|
42
|
+
:p_refs_with_recip => @p_refs_with_recip,
|
43
|
+
:n_refs_with_recip => @n_refs_with_recip,
|
34
44
|
:rbh_per_reference => @rbh_per_reference,
|
35
45
|
:reference_coverage => @reference_coverage,
|
36
46
|
:ortholog_hit_ratio => @ortholog_hit_ratio,
|
37
|
-
:collapse_factor => @collapse_factor
|
47
|
+
:collapse_factor => @collapse_factor,
|
48
|
+
:n_chimeras => @n_chimeras,
|
49
|
+
:p_chimeras => @p_chimeras,
|
50
|
+
:cov25 => @cov[0],
|
51
|
+
:cov50 => @cov[1],
|
52
|
+
:cov75 => @cov[2],
|
53
|
+
:cov85 => @cov[3],
|
54
|
+
:cov95 => @cov[4],
|
55
|
+
:p_cov25 => @cov[0]/@reference.size.to_f,
|
56
|
+
:p_cov50 => @cov[1]/@reference.size.to_f,
|
57
|
+
:p_cov75 => @cov[2]/@reference.size.to_f,
|
58
|
+
:p_cov85 => @cov[3]/@reference.size.to_f,
|
59
|
+
:p_cov95 => @cov[4]/@reference.size.to_f
|
38
60
|
}
|
39
61
|
end
|
40
62
|
|
41
|
-
def
|
42
|
-
|
43
|
-
|
63
|
+
# Create a CRB_Blast for the assembly and reference files, run it with
# an e-value of 1e-5 and the configured thread count, and return the
# CRB_Blast object itself.
def reciprocal_best_blast
  CRB_Blast.new(@assembly.file, @reference.file).tap do |blast|
    blast.run(1e-5, @threads)
  end
end
|
45
68
|
|
46
|
-
|
69
|
+
# Ortholog hit ratio: the number of reference positions covered by
# reciprocal hits divided by the total length of the references that
# have at least one reciprocal hit.
#
# crbblast - object responding to #reciprocals (contig name => list of
#            hits) and #target_is_prot; each hit responds to #target,
#            #tlen, #tstart and #tend.
#
# Side effects:
#   * sets @n_refs_with_recip to the number of references with a hit
#   * adds to @cov, five counters of references whose covered proportion
#     reaches 25%, 50%, 75%, 85% and 95% (initialised here even when
#     there are no hits)
#
# Hit coordinates are treated as 1-based and inclusive. For a protein
# reference they are converted to nucleotide coordinates so that residue
# p spans bases 3p-2..3p, matching the target length of tlen * 3.
#
# Returns the memoised @ortholog_hit_ratio when it is already set,
# otherwise the freshly computed ratio (0.0 when nothing was hit).
def ortholog_hit_ratio crbblast
  return @ortholog_hit_ratio unless @ortholog_hit_ratio.nil?

  # group the reciprocal hits by the reference they hit
  targets = {}
  crbblast.reciprocals.each_pair do |_query, list|
    list.each { |hit| (targets[hit.target] ||= []) << hit }
  end
  @n_refs_with_recip = targets.size

  thresholds = [0.25, 0.5, 0.75, 0.85, 0.95]
  @cov ||= [0, 0, 0, 0, 0]
  scale = crbblast.target_is_prot ? 3 : 1
  total_coverage = 0
  total_length = 0
  targets.each_value do |list|
    target_length = list.last.tlen * scale
    intervals = list.map do |hit|
      first, last = [hit.tstart, hit.tend].minmax
      [(first - 1) * scale + 1, last * scale]
    end
    length_of_coverage = merged_interval_length(intervals)
    proportion = length_of_coverage / target_length.to_f
    thresholds.each_with_index do |threshold, i|
      @cov[i] += 1 if proportion >= threshold
    end
    total_coverage += length_of_coverage
    total_length += target_length
  end
  return 0.0 if total_length.zero?
  total_coverage / total_length.to_f
end

# Number of positions covered by the union of inclusive [start, stop]
# intervals; overlapping intervals (including those sharing a single
# position) are counted once.
def merged_interval_length(intervals)
  covered = 0
  current_start = current_stop = nil
  intervals.sort.each do |start, stop|
    if current_stop && start <= current_stop
      current_stop = stop if stop > current_stop
    else
      covered += current_stop - current_start + 1 if current_stop
      current_start, current_stop = start, stop
    end
  end
  covered += current_stop - current_start + 1 if current_stop
  covered
end
private :merged_interval_length
|
183
|
+
|
184
|
+
# Count contigs whose reciprocal hits look chimeric.
#
# For every contig, each unordered pair of its hits is scored:
#   * same target: the pair counts when the two hits overlap by more
#     than 0.75 on the target (per #overlap_amount)
#   * different targets: the pair counts when the two hits overlap by
#     less than 0.25 on the query
# A contig is counted when (counted pairs / number of hits) >= 0.5.
#
# Sets @n_chimeras and @p_chimeras (the count divided by the number of
# contigs with reciprocal hits) and returns @p_chimeras.
def chimeras crbblast
  @n_chimeras = 0
  crbblast.reciprocals.each_pair do |_contig, hits|
    flagged_pairs = hits.combination(2).count do |first, second|
      if first.target == second.target
        first_span = [first.tstart, first.tend].minmax
        second_span = [second.tstart, second.tend].minmax
        overlap_amount(*first_span, *second_span) > 0.75
      else
        first_span = [first.qstart, first.qend].minmax
        second_span = [second.qstart, second.qend].minmax
        overlap_amount(*first_span, *second_span) < 0.25
      end
    end
    @n_chimeras += 1 if flagged_pairs / hits.size.to_f >= 0.5
  end
  @p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
end
|
217
|
+
|
218
|
+
# Classify how interval a (astart..astop) relates to interval b
# (bstart..bstop). Comparisons are strict, so intervals that merely
# touch at one end are reported as not overlapping.
#
# Returns
#   0 - a and b are identical
#   1 - b starts at or before a's start and ends inside a
#   2 - a starts before b and ends inside b
#   3 - b starts at or before a's start and ends after a
#   4 - a starts before b and ends after b
#   5 - no overlap, a starts before b
#   6 - no overlap, a does not start before b
def overlap(astart, astop, bstart, bstop)
  return 0 if astart == bstart && astop == bstop

  if astart < bstart
    return 5 unless astop > bstart
    astop > bstop ? 4 : 2
  else
    return 6 unless bstop > astart
    bstop > astop ? 3 : 1
  end
end
|
243
|
+
|
244
|
+
# Proportion by which interval a (astart..astop) and interval b
# (bstart..bstop) overlap: the length of the shared stretch divided by
# the length of the stretch spanned by both, using inclusive coordinates.
#
# Returns the Integer 1 for identical intervals, the Integer 0 when the
# intervals do not overlap (touching ends count as no overlap), and a
# Float otherwise.
def overlap_amount(astart, astop, bstart, bstop)
  return 1 if astart == bstart && astop == bstop

  if astart < bstart
    return 0 unless astop > bstart
    if astop > bstop
      shared, spanned = bstop - bstart + 1, astop - astart + 1
    else
      shared, spanned = astop - bstart + 1, bstop - astart + 1
    end
  else
    return 0 unless bstop > astart
    if bstop > astop
      shared, spanned = astop - astart + 1, bstop - bstart + 1
    else
      shared, spanned = bstop - astart + 1, astop - bstart + 1
    end
  end
  shared / spanned.to_f
end
|
51
269
|
|
52
270
|
def collapse_factor hits=nil
|
53
271
|
return @collapse_factor unless @collapse_factor.nil?
|
54
272
|
targets = {}
|
55
|
-
hits.each_pair do |query,
|
56
|
-
|
57
|
-
|
58
|
-
targets
|
273
|
+
hits.each_pair do |query, list|
|
274
|
+
list.each do |hit|
|
275
|
+
target = hit.target
|
276
|
+
unless targets.has_key? target
|
277
|
+
targets[target] = Set.new
|
278
|
+
end
|
279
|
+
targets[target] << query
|
59
280
|
end
|
60
|
-
targets[target] << query
|
61
281
|
end
|
62
282
|
sum = targets.values.reduce(0.0){ |summer, val| summer += val.size }
|
63
283
|
sum / targets.size
|
@@ -0,0 +1,212 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
require 'inline'
|
3
|
+
|
4
|
+
module Transrate
|
5
|
+
|
6
|
+
# A contig in a transcriptome assembly.
#
# Wraps a sequence object that responds to #seq (the sequence string),
# #length and #size, and optionally #entry_id. Composition statistics are
# computed lazily from the sequence string and memoised.
class Contig

  include Enumerable
  extend Forwardable
  def_delegators :@seq, :size, :length
  attr_accessor :seq, :name, :coverage

  # seq  - the wrapped sequence object
  # name - used as the contig name only when seq has no #entry_id
  def initialize(seq, name: nil)
    @seq = seq
    @name = seq.respond_to?(:entry_id) ? seq.entry_id : name
  end

  # Yield each character of the sequence string (backs Enumerable).
  def each &block
    @seq.seq.each_char(&block)
  end

  # Base composition of the contig
  #
  # Returns a Hash of downcased base symbol => count. :a, :t, :c, :g and
  # :n are always present; any other character found in the sequence is
  # counted under its own key instead of raising. The dibase composition
  # is computed in the same pass and memoised alongside.
  def base_composition
    return @base_composition if @base_composition

    base_comp = { :a => 0, :t => 0, :c => 0, :g => 0, :n => 0 }
    dibase_comp = { :cg => 0 }
    last_base = nil
    @seq.seq.each_char do |base|
      # single bases
      key = base.downcase.to_sym
      base_comp[key] = base_comp.fetch(key, 0) + 1
      if last_base
        # pairs of bases
        dikey = "#{last_base}#{base}".downcase.to_sym
        dibase_comp[dikey] = dibase_comp.fetch(dikey, 0) + 1
      end
      last_base = base
    end
    @base_composition = base_comp
    @dibase_composition = dibase_comp
    base_comp
  end

  # Dibase composition of the contig
  #
  # Returns a Hash of downcased two-base symbol => count of adjacent
  # pairs; :cg is always present.
  def dibase_composition
    return @dibase_composition if @dibase_composition

    base_composition
    @dibase_composition
  end

  # Number of bases that are C
  def bases_c
    base_composition[:c]
  end

  # Proportion of bases that are C
  def prop_c
    bases_c / length.to_f
  end

  # Number of bases that are G
  def bases_g
    base_composition[:g]
  end

  # Proportion of bases that are G
  def prop_g
    bases_g / length.to_f
  end

  # Number of bases that are A
  def bases_a
    base_composition[:a]
  end

  # Proportion of bases that are A
  def prop_a
    bases_a / length.to_f
  end

  # Number of bases that are T
  def bases_t
    base_composition[:t]
  end

  # Proportion of bases that are T
  def prop_t
    bases_t / length.to_f
  end

  # Number of bases that are N
  def bases_n
    base_composition[:n]
  end

  # Proportion of bases that are N
  def prop_n
    bases_n / length.to_f
  end

  # Number of bases that are G or C
  def bases_gc
    bases_g + bases_c
  end

  # Proportion of bases that are G or C
  def prop_gc
    prop_g + prop_c
  end

  # GC skew, computed here as the G+C proportion divided by the summed
  # A, T, G and C proportions.
  # NOTE(review): this is not the (G - C) / (G + C) definition of skew;
  # confirm which one is intended.
  def gc_skew
    prop_gc / (prop_a + prop_t + prop_gc)
  end

  # AT skew, the counterpart of #gc_skew: the A+T proportion divided by
  # the summed A, T, G and C proportions. The numerator is parenthesised
  # so that the whole A+T sum is divided, not just the T proportion.
  def at_skew
    (prop_a + prop_t) / (prop_a + prop_t + prop_gc)
  end

  # CpG count
  def cpg_count
    dibase_composition[:cg]
  end

  # CpG (C-phosphate-G) ratio: the CG dinucleotide count divided by the
  # product of the C and G proportions.
  # NOTE(review): the usual observed/expected CpG ratio also divides by
  # the sequence length; confirm whether that normalisation is wanted.
  def cpg_ratio
    dibase_composition[:cg] / (prop_c * prop_g)
  end

  # Find the longest orf in the contig
  #
  # Returns whatever the inlined C function longest_orf returns for the
  # sequence string.
  def orf_length
    longest_orf(@seq.seq)
  end

  # Inlined C longest-ORF function
  #
  # Walks the sequence once keeping six running counters (len[0..2] for
  # the three forward frames, len[3..5] for the three reverse frames).
  # A counter is reset when an upper-case stop codon (TAG/TAA/TGA, or
  # their reverse complements CTA/TTA/TCA) starts at a position in its
  # frame, and incremented otherwise. After the walk every counter is
  # compared against the best value so that an open run reaching the end
  # of the sequence is considered in all six frames.
  inline do |builder|
    builder.c <<SRC
      static
      void
      longest_orf(VALUE _s) {
        int i,sl,longest=0;
        int len[6];
        char * c_str;

        sl = RSTRING_LEN(_s);
        c_str = StringValueCStr(_s);
        for (i=0;i<6;i++) {
          len[i]=0;
        }
        for (i=0;i<sl-2;i++) {
          if (c_str[i]=='T' &&
            ((c_str[i+1]=='A' && c_str[i+2]=='G') ||
            (c_str[i+1]=='A' && c_str[i+2]=='A') ||
            (c_str[i+1]=='G' && c_str[i+2]=='A'))) {
            if (len[i%3] > longest) {
              longest = len[i%3];
            }
            len[i%3]=0;
          } else {
            len[i%3]++;
          }
          if (c_str[i+2]=='A' &&
            ((c_str[i]=='C' && c_str[i+1]=='T') ||
            (c_str[i]=='T' && c_str[i+1]=='T') ||
            (c_str[i]=='T' && c_str[i+1]=='C'))) {
            if (len[3+i%3] > longest) {
              longest = len[3+i%3];
            }
            len[3+i%3]=0;
          } else {
            len[3+i%3]++;
          }
        }
        for (i=0;i<6;i++) {
          if (len[i] > longest) {
            longest = len[i];
          }
        }
        return INT2NUM(longest);
      }
SRC
  end

  # Linguistic complexity for word size k: the number of distinct k-mers
  # present in the (upcased) sequence divided by 4 ** k.
  def linguistic_complexity k
    d = 4 ** k
    set = Set.new
    (0..@seq.length-k).each do |i|
      set << @seq.seq.slice(i,k).upcase # slice(start, length)
    end # count how many kmers in seq
    set.size / d.to_f
  end

end
|
211
|
+
|
212
|
+
end
|