transrate 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/transrate/assembly.rb +87 -83
- data/lib/transrate/version.rb +1 -1
- metadata +1 -1
data/lib/transrate/assembly.rb
CHANGED
@@ -3,103 +3,107 @@ require 'bettersam'
|
|
3
3
|
require 'csv'
|
4
4
|
require 'forwardable'
|
5
5
|
|
6
|
-
|
6
|
+
module Transrate
|
7
7
|
|
8
|
-
|
9
|
-
extend Forwardable
|
10
|
-
def_delegators :@assembly, :each, :<<
|
8
|
+
class Assembly
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
include Enumerable
|
11
|
+
extend Forwardable
|
12
|
+
def_delegators :@assembly, :each, :<<
|
15
13
|
|
16
|
-
|
17
|
-
|
14
|
+
attr_accessor :ublast_db
|
15
|
+
attr_accessor :orfs_ublast_db
|
16
|
+
attr_accessor :protein
|
18
17
|
|
19
|
-
|
20
|
-
|
18
|
+
# number of bases in the assembly
|
19
|
+
attr_writer :n_bases
|
21
20
|
|
22
|
-
|
23
|
-
|
21
|
+
# assembly filename
|
22
|
+
attr_accessor :file
|
24
23
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
@
|
34
|
-
@
|
24
|
+
# assembly n50
|
25
|
+
attr_reader :n50
|
26
|
+
|
27
|
+
# Return a new Assembly.
|
28
|
+
#
|
29
|
+
# - +:file+ - path to the assembly FASTA file
|
30
|
+
def initialize file
|
31
|
+
@file = file
|
32
|
+
@assembly = []
|
33
|
+
@n_bases = 0
|
34
|
+
Bio::FastaFormat.open(file).each do |entry|
|
35
|
+
@n_bases += entry.length
|
36
|
+
@assembly << entry
|
37
|
+
end
|
38
|
+
@assembly.sort_by! { |x| x.length }
|
35
39
|
end
|
36
|
-
@assembly.sort_by! { |x| x.length }
|
37
|
-
end
|
38
40
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
41
|
+
# Return the basic stats hash for an Assembly built by loading sequences
|
42
|
+
# from the FASTA-format +:file+
|
43
|
+
def self.stats_from_fasta file
|
44
|
+
a = Assembly.new file
|
45
|
+
a.basic_stats
|
46
|
+
end
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
def run
|
49
|
+
stats = self.basic_stats
|
50
|
+
stats.each_pair do |key, value|
|
51
|
+
ivar = "@#{key.gsub(/ /, '_')}".to_sym
|
52
|
+
self.instance_variable_set(key, value)
|
53
|
+
end
|
51
54
|
end
|
52
|
-
end
|
53
55
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
56
|
+
# Return a hash of statistics about this assembly
|
57
|
+
def basic_stats
|
58
|
+
cumulative_length = 0.0
|
59
|
+
# we'll calculate Nx for all these x
|
60
|
+
x = [90, 70, 50, 30, 10]
|
61
|
+
x2 = x.clone
|
62
|
+
cutoff = x2.pop / 100.0
|
63
|
+
res = []
|
64
|
+
n1k = 0
|
65
|
+
n10k = 0
|
66
|
+
@assembly.each do |s|
|
67
|
+
new_cum_len = cumulative_length + s.length
|
68
|
+
prop = new_cum_len / self.n_bases
|
69
|
+
n1k += 1 if s.length > 1_000
|
70
|
+
n10k += 1 if s.length > 10_000
|
71
|
+
if prop >= cutoff
|
72
|
+
res << s.length
|
73
|
+
break if x2.empty?
|
74
|
+
cutoff = x2.pop / 100.0
|
75
|
+
end
|
76
|
+
cumulative_length = new_cum_len
|
73
77
|
end
|
74
|
-
|
78
|
+
mean = cumulative_length / @assembly.size
|
79
|
+
ns = Hash[x.map { |n| "N#{n}" }.zip(res)]
|
80
|
+
{
|
81
|
+
"n_seqs" => @assembly.size,
|
82
|
+
"smallest" => @assembly.first.length,
|
83
|
+
"largest" => @assembly.last.length,
|
84
|
+
"n_bases" => @n_bases,
|
85
|
+
"mean_len" => mean,
|
86
|
+
"n > 1k" => n1k,
|
87
|
+
"n > 10k" => n10k
|
88
|
+
}.merge ns
|
75
89
|
end
|
76
|
-
mean = cumulative_length / @assembly.size
|
77
|
-
ns = Hash[x.map { |n| "N#{n}" }.zip(res)]
|
78
|
-
{
|
79
|
-
"n_seqs" => @assembly.size,
|
80
|
-
"smallest" => @assembly.first.length,
|
81
|
-
"largest" => @assembly.last.length,
|
82
|
-
"n_bases" => @n_bases,
|
83
|
-
"mean_len" => mean,
|
84
|
-
"n > 1k" => n1k,
|
85
|
-
"n > 10k" => n10k
|
86
|
-
}.merge ns
|
87
|
-
end
|
88
90
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
91
|
+
# return the number of bases in the assembly, calculating
|
92
|
+
# from the assembly if it hasn't already been done.
|
93
|
+
def n_bases
|
94
|
+
unless @n_bases
|
95
|
+
@n_bases = 0
|
96
|
+
@assembly.each { |s| @n_bases += s.length }
|
97
|
+
end
|
98
|
+
@n_bases
|
99
|
+
end
|
100
|
+
|
101
|
+
def print_stats
|
102
|
+
self.basic_stats.map do |k, v|
|
103
|
+
"#{k}#{" " * (20 - (k.length + v.to_i.to_s.length))}#{v.to_i}"
|
104
|
+
end.join("\n")
|
95
105
|
end
|
96
|
-
@n_bases
|
97
|
-
end
|
98
106
|
|
99
|
-
|
100
|
-
self.basic_stats.map do |k, v|
|
101
|
-
"#{k}#{" " * (20 - (k.length + v.to_i.to_s.length))}#{v.to_i}"
|
102
|
-
end.join("\n")
|
103
|
-
end
|
107
|
+
end # Assembly
|
104
108
|
|
105
|
-
end #
|
109
|
+
end # Transrate
|
data/lib/transrate/version.rb
CHANGED