transrate 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +16 -1
- data/.travis.yml +8 -0
- data/README.md +45 -43
- data/Rakefile +36 -0
- data/bin/transrate +98 -50
- data/deps/deps.yaml +55 -0
- data/lib/transrate.rb +19 -4
- data/lib/transrate/assembly.rb +93 -182
- data/lib/transrate/bowtie2.rb +37 -13
- data/lib/transrate/cmd.rb +19 -0
- data/lib/transrate/comparative_metrics.rb +239 -19
- data/lib/transrate/contig.rb +212 -0
- data/lib/transrate/contig_metrics.rb +76 -0
- data/lib/transrate/read_metrics.rb +83 -41
- data/lib/transrate/samtools.rb +73 -0
- data/lib/transrate/transrater.rb +31 -11
- data/lib/transrate/version.rb +1 -1
- data/test/data/150uncovered.l.fq +892 -0
- data/test/data/150uncovered.r.fq +892 -0
- data/test/data/Os.protein.2.fa +95 -0
- data/test/data/Os.protein.fa +199 -0
- data/test/data/assembly.2.fa +26 -0
- data/test/{assembly.fasta → data/assembly.fasta} +0 -0
- data/test/data/bridging_reads.l.fastq +20 -0
- data/test/data/bridging_reads.r.fastq +20 -0
- data/test/data/sorghum_transcript.fa +4 -0
- data/test/data/tiny.sam +4 -0
- data/test/helper.rb +33 -2
- data/test/test_bowtie.rb +54 -0
- data/test/test_cmd.rb +15 -0
- data/test/test_comp_metrics.rb +177 -0
- data/test/test_contig.rb +61 -0
- data/test/test_contig_metrics.rb +50 -0
- data/test/test_inline.rb +10 -9
- data/test/test_read_metrics.rb +68 -0
- data/test/test_samtools.rb +22 -0
- data/test/test_transrate.rb +40 -0
- data/test/test_transrater.rb +68 -0
- data/transrate.gemspec +16 -10
- metadata +232 -57
- data/lib/transrate/express.rb +0 -37
- data/lib/transrate/log.rb +0 -16
- data/lib/transrate/rb_hit.rb +0 -33
- data/lib/transrate/reciprocal_annotation.rb +0 -105
- data/lib/transrate/usearch.rb +0 -66
- data/test/test_test.rb +0 -41
data/lib/transrate/bowtie2.rb
CHANGED
@@ -1,42 +1,66 @@
|
|
1
1
|
module Transrate
|
2
2
|
|
3
|
+
# Error raised by Bowtie2 when a required executable cannot be found or
# when reads are mapped before an index has been built.
class Bowtie2Error < StandardError; end
|
5
|
+
|
3
6
|
class Bowtie2
|
4
7
|
|
5
8
|
require 'which'
|
6
9
|
include Which
|
7
10
|
|
11
|
+
attr_reader :index_name, :sam
|
12
|
+
|
8
13
|
# Looks up the bowtie2 and bowtie2-build executables with `which` and
# resets the index bookkeeping.
#
# Raises Bowtie2Error when either lookup returns no candidates.
def initialize
  aligner_candidates = which('bowtie2')
  raise Bowtie2Error.new("could not find bowtie2 in the path") if aligner_candidates.empty?
  @bowtie2 = aligner_candidates.first

  builder_candidates = which('bowtie2-build')
  raise Bowtie2Error.new("could not find bowtie2-build in the path") if builder_candidates.empty?
  @bowtie2_build = builder_candidates.first

  # No index is available until build_index has been called.
  @index_built = false
  @index_name = ""
end
|
16
27
|
|
17
|
-
def map_reads
|
28
|
+
# Aligns reads against the index created by build_index and returns the
# absolute path of the resulting SAM file. The alignment is skipped when a
# file already exists at that path.
#
# file       - not referenced by this method; kept so existing callers work.
# left       - path of the first-mate reads (or the only reads file).
# right      - path of the second-mate reads, or nil for unpaired reads.
# insertsize - expected insert size (default 200).
# insertsd   - insert size standard deviation (default 50).
# outputname - accepted for interface compatibility; currently not used.
# threads    - value passed to bowtie2's -p option (default 8).
#
# Raises Bowtie2Error if build_index has not been called first.
def map_reads(file, left,
              right, insertsize: 200,
              insertsd: 50, outputname: nil,
              threads: 8)
  raise Bowtie2Error.new("Index not built") unless @index_built
  # right is nil for unpaired reads, so it must not reach File.basename
  read_names = [left, right].compact.map { |path| File.basename(path) }
  index = File.basename(@index_name)
  @sam = File.expand_path("#{(read_names + [index]).join('.')}.sam")
  # maximum fragment length accepted for a concordant pair (-X)
  realistic_dist = insertsize + (3 * insertsd)
  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
  unless File.exist? @sam
    # construct bowtie command
    bowtiecmd = "#{@bowtie2} --very-sensitive"
    bowtiecmd += " -p #{threads} -X #{realistic_dist}"
    bowtiecmd += " --quiet --no-unal"
    bowtiecmd += " --seed 1337"
    bowtiecmd += " -x #{@index_name}"
    if right
      # paired end
      bowtiecmd += " -1 #{left}"
      bowtiecmd += " -2 #{right}"
    else
      # bowtie2 takes unpaired reads through -U, not -1
      bowtiecmd += " -U #{left}"
    end
    bowtiecmd += " -S #{@sam}"
    # run bowtie
    runner = Cmd.new bowtiecmd
    runner.run
  end
  @sam
end
|
35
55
|
|
36
56
|
# Builds a bowtie2 index for +file+ unless one already exists, and records
# that an index is available for map_reads.
#
# The index is named after the file's basename with its final extension
# removed. A name with no extension is kept whole rather than collapsing
# to an empty string.
#
# file - path of the FASTA file to index.
#
# Returns true.
def build_index file
  @index_name = File.basename(file, ".*")
  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
  unless File.exist?(@index_name + '.1.bt2')
    cmd = "#{@bowtie2_build} --quiet --offrate 1 #{file} #{@index_name}"
    runner = Cmd.new cmd
    runner.run
  end
  @index_built = true
end
|
41
65
|
|
42
66
|
end # Bowtie2
|
@@ -1,7 +1,8 @@
|
|
1
1
|
require 'set'
|
2
|
+
require 'crb-blast'
|
2
3
|
|
3
4
|
module Transrate
|
4
|
-
|
5
|
+
|
5
6
|
class ComparativeMetrics
|
6
7
|
|
7
8
|
attr_reader :rbh_per_contig
|
@@ -9,21 +10,26 @@ module Transrate
|
|
9
10
|
attr_reader :reciprocal_hits
|
10
11
|
attr_reader :has_run
|
11
12
|
attr_reader :reference_coverage
|
13
|
+
attr_reader :n_chimeras, :p_chimeras
|
12
14
|
|
13
|
-
def initialize assembly, reference
|
15
|
+
# Stores the inputs used by the comparative analysis.
#
# assembly  - the assembly under evaluation.
# reference - the reference it is compared against.
# threads   - thread count later handed to the reciprocal best BLAST run.
def initialize(assembly, reference, threads)
  @assembly, @reference, @threads = assembly, reference, threads
end
|
18
20
|
|
19
21
|
# Runs the reciprocal best BLAST and derives every comparative metric from
# its result, storing each one in an instance variable.
#
# ortholog_hit_ratio must run before the reference proportions are
# computed because it sets @n_refs_with_recip as a side effect.
#
# Returns true.
def run
  @crbblast = reciprocal_best_blast
  @ortholog_hit_ratio = ortholog_hit_ratio(@crbblast)
  @collapse_factor = collapse_factor(@crbblast.target_results)
  @reciprocal_hits = @crbblast.size

  hits = @reciprocal_hits.to_f
  @rbh_per_reference = hits / @reference.size.to_f
  @reference_coverage = @ortholog_hit_ratio * @rbh_per_reference
  @rbh_per_contig = hits / @assembly.assembly.size.to_f

  @n_contigs_with_recip = @crbblast.reciprocals.size
  @p_contigs_with_recip = @n_contigs_with_recip / @assembly.size.to_f
  @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f

  chimeras(@crbblast)
  @has_run = true
end
|
29
35
|
|
@@ -31,33 +37,247 @@ module Transrate
|
|
31
37
|
{
|
32
38
|
:reciprocal_hits => @reciprocal_hits,
|
33
39
|
:rbh_per_contig => @rbh_per_contig,
|
40
|
+
:p_contigs_with_recip => @p_contigs_with_recip,
|
41
|
+
:n_contigs_with_recip => @n_contigs_with_recip,
|
42
|
+
:p_refs_with_recip => @p_refs_with_recip,
|
43
|
+
:n_refs_with_recip => @n_refs_with_recip,
|
34
44
|
:rbh_per_reference => @rbh_per_reference,
|
35
45
|
:reference_coverage => @reference_coverage,
|
36
46
|
:ortholog_hit_ratio => @ortholog_hit_ratio,
|
37
|
-
:collapse_factor => @collapse_factor
|
47
|
+
:collapse_factor => @collapse_factor,
|
48
|
+
:n_chimeras => @n_chimeras,
|
49
|
+
:p_chimeras => @p_chimeras,
|
50
|
+
:cov25 => @cov[0],
|
51
|
+
:cov50 => @cov[1],
|
52
|
+
:cov75 => @cov[2],
|
53
|
+
:cov85 => @cov[3],
|
54
|
+
:cov95 => @cov[4],
|
55
|
+
:p_cov25 => @cov[0]/@reference.size.to_f,
|
56
|
+
:p_cov50 => @cov[1]/@reference.size.to_f,
|
57
|
+
:p_cov75 => @cov[2]/@reference.size.to_f,
|
58
|
+
:p_cov85 => @cov[3]/@reference.size.to_f,
|
59
|
+
:p_cov95 => @cov[4]/@reference.size.to_f
|
38
60
|
}
|
39
61
|
end
|
40
62
|
|
41
|
-
def
|
42
|
-
|
43
|
-
|
63
|
+
# Creates a CRB_Blast for the assembly and reference files, runs it and
# returns the runner so its results can be inspected.
#
# NOTE(review): 1e-5 is presumably the e-value cutoff - confirm against
# the crb-blast API.
def reciprocal_best_blast
  CRB_Blast.new(@assembly.file, @reference.file).tap do |blast|
    blast.run(1e-5, @threads)
  end
end
|
45
68
|
|
46
|
-
|
69
|
+
# coverage of contigs that have reciprocal hits
|
70
|
+
# divided by
|
71
|
+
# number of reciprocal targets
|
72
|
+
# Fraction of the reference covered by reciprocal hits: the number of
# target bases covered by at least one hit, summed over every target that
# has a reciprocal hit, divided by the summed length of those targets.
#
# Side effects: sets @n_refs_with_recip to the number of distinct targets,
# and increments @cov[i] for each target whose covered proportion reaches
# 25%, 50%, 75%, 85% and 95% respectively.
#
# crbblast - object responding to reciprocals (query => list of hits,
#            each with target, tstart, tend, tlen) and target_is_prot.
#
# Returns the memoised @ortholog_hit_ratio when it is already set,
# otherwise a Float; 0.0 when there are no reciprocal hits.
def ortholog_hit_ratio crbblast
  return @ortholog_hit_ratio unless @ortholog_hit_ratio.nil?

  hits_by_target = {}
  crbblast.reciprocals.each_pair do |_query, hits|
    hits.each { |hit| (hits_by_target[hit.target] ||= []) << hit }
  end
  @n_refs_with_recip = hits_by_target.size

  thresholds = [0.25, 0.5, 0.75, 0.85, 0.95]
  @cov ||= [0, 0, 0, 0, 0]
  # protein targets are measured in nucleotides: residue i spans 3i-2..3i
  scale = crbblast.target_is_prot ? 3 : 1
  total_coverage = 0
  total_length = 0
  hits_by_target.each_value do |hits|
    target_length = hits.last.tlen * scale
    intervals = hits.map do |hit|
      first, last = [hit.tstart, hit.tend].minmax
      [(first - 1) * scale + 1, last * scale]
    end
    covered = covered_length(intervals)
    proportion = covered / target_length.to_f
    thresholds.each_with_index do |threshold, i|
      @cov[i] += 1 if proportion >= threshold
    end
    total_coverage += covered
    total_length += target_length
  end
  return 0.0 if total_length.zero?
  total_coverage / total_length.to_f
end

# Number of positions covered by the union of inclusive [start, stop]
# intervals. Sorting first means each interval only has to be compared
# with the furthest position reached so far, so shared or touching
# positions are counted once.
def covered_length(intervals)
  covered = 0
  reach = nil
  intervals.sort.each do |start, stop|
    if reach.nil? || start > reach
      covered += stop - start + 1
      reach = stop
    elsif stop > reach
      covered += stop - reach
      reach = stop
    end
  end
  covered
end
private :covered_length
|
183
|
+
|
184
|
+
def chimeras crbblast
|
185
|
+
@n_chimeras = 0
|
186
|
+
crbblast.reciprocals.each_pair do |key, list|
|
187
|
+
p = 0
|
188
|
+
list.each_with_index do |a, i|
|
189
|
+
list.each_with_index do |b, j|
|
190
|
+
if j>i
|
191
|
+
if a.target == b.target
|
192
|
+
astart, astop = [a.tstart, a.tend].minmax
|
193
|
+
bstart, bstop = [b.tstart, b.tend].minmax
|
194
|
+
|
195
|
+
oa = overlap_amount(astart, astop, bstart, bstop)
|
196
|
+
if oa > 0.75
|
197
|
+
p += 1
|
198
|
+
end
|
199
|
+
else
|
200
|
+
astart, astop = [a.qstart, a.qend].minmax
|
201
|
+
bstart, bstop = [b.qstart, b.qend].minmax
|
202
|
+
|
203
|
+
oa = overlap_amount(astart, astop, bstart, bstop)
|
204
|
+
if oa < 0.25
|
205
|
+
p += 1
|
206
|
+
end
|
207
|
+
end
|
208
|
+
end
|
209
|
+
end
|
210
|
+
end
|
211
|
+
if p/list.size.to_f >= 0.5
|
212
|
+
@n_chimeras += 1
|
213
|
+
end
|
214
|
+
end
|
215
|
+
@p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
|
216
|
+
end
|
217
|
+
|
218
|
+
# Classifies how interval a (astart..astop) relates to interval b
# (bstart..bstop) and returns an Integer code:
#
# 0 - identical start and stop
# 1 - a starts at or after b's start and b ends inside a
# 2 - a starts before b and ends inside b
# 3 - a starts at or after b's start and b extends past a's end
# 4 - a starts before b and extends past b's end
# 5 - a starts before b and does not reach past b's start (no overlap)
# 6 - a starts at or after b's start and b does not reach past a's start
#     (no overlap)
#
# Comparisons are strict, so intervals that merely touch at one position
# are reported as not overlapping.
def overlap(astart, astop, bstart, bstop)
  return 0 if astart == bstart && astop == bstop

  if astart < bstart
    return 5 unless astop > bstart
    astop > bstop ? 4 : 2
  else
    return 6 unless bstop > astart
    bstop > astop ? 3 : 1
  end
end
|
243
|
+
|
244
|
+
# Measures how much interval a (astart..astop) and interval b
# (bstart..bstop) overlap, as shared length divided by the length of the
# span covering both.
#
# Returns the Integer 1 for identical intervals, the Integer 0 when they
# do not overlap (strict comparison, so intervals that only touch at one
# position count as not overlapping), and a Float otherwise.
def overlap_amount(astart, astop, bstart, bstop)
  return 1 if astart == bstart && astop == bstop

  overlapping = astart < bstart ? astop > bstart : bstop > astart
  return 0 unless overlapping

  shared = [astop, bstop].min - [astart, bstart].max + 1
  spanned = [astop, bstop].max - [astart, bstart].min + 1
  shared / spanned.to_f
end
|
51
269
|
|
52
270
|
def collapse_factor hits=nil
|
53
271
|
return @collapse_factor unless @collapse_factor.nil?
|
54
272
|
targets = {}
|
55
|
-
hits.each_pair do |query,
|
56
|
-
|
57
|
-
|
58
|
-
targets
|
273
|
+
hits.each_pair do |query, list|
|
274
|
+
list.each do |hit|
|
275
|
+
target = hit.target
|
276
|
+
unless targets.has_key? target
|
277
|
+
targets[target] = Set.new
|
278
|
+
end
|
279
|
+
targets[target] << query
|
59
280
|
end
|
60
|
-
targets[target] << query
|
61
281
|
end
|
62
282
|
sum = targets.values.reduce(0.0){ |summer, val| summer += val.size }
|
63
283
|
sum / targets.size
|
@@ -0,0 +1,212 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
require 'inline'
|
3
|
+
|
4
|
+
module Transrate
|
5
|
+
|
6
|
+
# A contig in a transcriptome assembly.
|
7
|
+
class Contig
|
8
|
+
|
9
|
+
include Enumerable
|
10
|
+
extend Forwardable
|
11
|
+
def_delegators :@seq, :size, :length
|
12
|
+
attr_accessor :seq, :name, :coverage
|
13
|
+
|
14
|
+
# Wraps a sequence object.
#
# seq  - the sequence to wrap.
# name - name to use when seq does not respond to entry_id; when it does,
#        seq.entry_id wins and this argument is ignored.
def initialize(seq, name: nil)
  @seq = seq
  @name = if seq.respond_to?(:entry_id)
            seq.entry_id
          else
            name
          end
end
|
18
|
+
|
19
|
+
# Yields each character of the wrapped sequence string in order; this is
# the iteration method the included Enumerable builds on.
def each(&block)
  bases = @seq.seq
  bases.each_char(&block)
end
|
22
|
+
|
23
|
+
# Base composition of the contig
|
24
|
+
# Counts each base (case-insensitively) in the sequence and, in the same
# pass, each pair of adjacent bases.
#
# The single-base counts are memoised in @base_composition and the pair
# counts in @dibase_composition. The result always has the keys :a, :t,
# :c, :g and :n; any other symbol found in the sequence gets its own key
# instead of raising. The pair counts always contain :cg.
#
# Returns the Hash of single-base counts.
def base_composition
  return @base_composition if @base_composition

  base_comp = { :a => 0, :t => 0, :c => 0, :g => 0, :n => 0 }
  dibase_comp = { :cg => 0 }
  last_base = nil
  @seq.seq.each_char do |base|
    # single bases
    key = base.downcase.to_sym
    base_comp[key] = base_comp.fetch(key, 0) + 1
    if last_base
      # pairs of bases
      dikey = "#{last_base}#{base}".downcase.to_sym
      dibase_comp[dikey] = dibase_comp.fetch(dikey, 0) + 1
    end
    last_base = base
  end
  @base_composition = base_comp
  @dibase_composition = dibase_comp
  base_comp
end
|
58
|
+
|
59
|
+
# Dibase composition of the contig
|
60
|
+
# Counts of adjacent base pairs, keyed by lower-case symbols.
# The counts are produced as a by-product of base_composition, which is
# triggered here the first time they are needed.
def dibase_composition
  base_composition unless @dibase_composition
  @dibase_composition
end
|
67
|
+
|
68
|
+
# Number of bases that are C
|
69
|
+
def bases_c
  base_composition[:c]
end

# Proportion of bases that are C (count divided by sequence length)
def prop_c
  bases_c.fdiv(length)
end

# Number of bases that are G
def bases_g
  base_composition[:g]
end

# Proportion of bases that are G
def prop_g
  bases_g.fdiv(length)
end

# Number of bases that are A
def bases_a
  base_composition[:a]
end

# Proportion of bases that are A
def prop_a
  bases_a.fdiv(length)
end

# Number of bases that are T
def bases_t
  base_composition[:t]
end

# Proportion of bases that are T
def prop_t
  bases_t.fdiv(length)
end

# Number of bases that are N
def bases_n
  base_composition[:n]
end

# Proportion of bases that are N
def prop_n
  bases_n.fdiv(length)
end

# Number of bases that are G or C
def bases_gc
  bases_g + bases_c
end

# Proportion of bases that are G or C
def prop_gc
  prop_g + prop_c
end
|
124
|
+
|
125
|
+
# GC skew
|
126
|
+
# Proportion of G and C among the A, T, G and C bases (N is left out of
# the denominator).
# NOTE(review): despite the name this is not the classical GC skew,
# (G - C) / (G + C) - confirm which quantity callers expect.
def gc_skew
  gc = prop_gc
  gc / (prop_a + prop_t + gc)
end
|
129
|
+
|
130
|
+
# AT skew
|
131
|
+
# Proportion of A and T among the A, T, G and C bases (N is left out of
# the denominator); the complement of gc_skew.
#
# The numerator is parenthesised so that the whole A + T proportion is
# divided, not just prop_t.
def at_skew
  (prop_a + prop_t) / (prop_a + prop_t + prop_gc)
end
|
134
|
+
|
135
|
+
# CpG count
|
136
|
+
# Number of positions where a C is immediately followed by a G.
def cpg_count
  pair_counts = dibase_composition
  pair_counts[:cg]
end
|
139
|
+
|
140
|
+
# CpG (C-phosphate-G) ratio
|
141
|
+
# Observed-to-expected CpG ratio: (CpG count * length) / (C count * G count).
#
# Raw counts are used throughout. Dividing the raw CpG count by the
# product of the C and G *proportions* inflates the result by a factor of
# the sequence length.
#
# Returns a Float; NaN when the sequence has no C or no G (0 / 0.0).
def cpg_ratio
  (dibase_composition[:cg] * length) / (bases_c * bases_g).to_f
end
|
144
|
+
|
145
|
+
# Find the longest orf in the contig
|
146
|
+
# Length of the longest open reading frame, as reported by the inlined
# longest_orf function for the sequence string.
def orf_length
  longest_orf(@seq.seq)
end
|
150
|
+
|
151
|
+
# Inlined C longest-ORF function
|
152
|
+
# longest_orf(str) scans the sequence once. len[0..2] hold the current
# stop-free run for the three forward frames and len[3..5] the same for
# the reverse strand, where stops appear as the reverse complements
# CTA, TTA and TCA. Only upper-case bases are recognised as stop codons.
#
# NOTE(review): the function is declared void yet returns a VALUE; this
# presumably relies on how RubyInline rewrites the signature - confirm
# before changing the declared return type.
inline do |builder|
  builder.c <<SRC
  static
  void
  longest_orf(VALUE _s) {
    int i,sl,longest=0;
    int len[6];
    char * c_str;

    sl = RSTRING_LEN(_s);
    c_str = StringValueCStr(_s);
    for (i=0;i<6;i++) {
      len[i]=0;
    }
    for (i=0;i<sl-2;i++) {
      if (c_str[i]=='T' &&
        ((c_str[i+1]=='A' && c_str[i+2]=='G') ||
        (c_str[i+1]=='A' && c_str[i+2]=='A') ||
        (c_str[i+1]=='G' && c_str[i+2]=='A'))) {
        if (len[i%3] > longest) {
          longest = len[i%3];
        }
        len[i%3]=0;
      } else {
        len[i%3]++;
      }
      if (c_str[i+2]=='A' &&
        ((c_str[i]=='C' && c_str[i+1]=='T') ||
        (c_str[i]=='T' && c_str[i+1]=='T') ||
        (c_str[i]=='T' && c_str[i+1]=='C'))) {
        if (len[3+i%3] > longest) {
          longest = len[3+i%3];
        }
        len[3+i%3]=0;
      } else {
        len[3+i%3]++;
      }
    }
    /* A run still open at the end of the sequence can be in any of the
       six frames, so every counter is checked, not only the frame the
       loop index happens to stop on. */
    for (i=0;i<6;i++) {
      if (len[i] > longest) {
        longest = len[i];
      }
    }
    return INT2NUM(longest);
  }
SRC
end
|
200
|
+
|
201
|
+
# Linguistic complexity for word size k: the number of distinct k-mers
# present in the sequence (case-insensitive) divided by 4 ** k, the number
# of possible k-mers over a four-letter alphabet.
#
# Distinct k-mers are found with Array#uniq, so the method does not depend
# on Set, which this file never requires.
#
# k - word length.
#
# Returns a Float; 0.0 when k is longer than the sequence.
def linguistic_complexity k
  d = 4 ** k
  # one window per start position; the range is empty when k > length
  kmers = (0..@seq.length-k).map do |i|
    @seq.seq.slice(i, k).upcase # slice(start, length)
  end
  kmers.uniq.size / d.to_f
end
|
209
|
+
|
210
|
+
end
|
211
|
+
|
212
|
+
end
|