transrate 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/transrate/assembly.rb +87 -83
- data/lib/transrate/version.rb +1 -1
- metadata +1 -1
data/lib/transrate/assembly.rb
CHANGED
@@ -3,103 +3,107 @@ require 'bettersam'
|
|
3
3
|
require 'csv'
|
4
4
|
require 'forwardable'
|
5
5
|
|
6
|
-
|
6
|
+
module Transrate
|
7
7
|
|
8
|
-
|
9
|
-
extend Forwardable
|
10
|
-
def_delegators :@assembly, :each, :<<
|
8
|
+
class Assembly
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
include Enumerable
|
11
|
+
extend Forwardable
|
12
|
+
def_delegators :@assembly, :each, :<<
|
15
13
|
|
16
|
-
|
17
|
-
|
14
|
+
attr_accessor :ublast_db
|
15
|
+
attr_accessor :orfs_ublast_db
|
16
|
+
attr_accessor :protein
|
18
17
|
|
19
|
-
|
20
|
-
|
18
|
+
# number of bases in the assembly
|
19
|
+
attr_writer :n_bases
|
21
20
|
|
22
|
-
|
23
|
-
|
21
|
+
# assembly filename
|
22
|
+
attr_accessor :file
|
24
23
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
@
|
34
|
-
@
|
24
|
+
# assembly n50
|
25
|
+
attr_reader :n50
|
26
|
+
|
27
|
+
# Return a new Assembly.
|
28
|
+
#
|
29
|
+
# - +:file+ - path to the assembly FASTA file
|
30
|
+
def initialize file
|
31
|
+
@file = file
|
32
|
+
@assembly = []
|
33
|
+
@n_bases = 0
|
34
|
+
Bio::FastaFormat.open(file).each do |entry|
|
35
|
+
@n_bases += entry.length
|
36
|
+
@assembly << entry
|
37
|
+
end
|
38
|
+
@assembly.sort_by! { |x| x.length }
|
35
39
|
end
|
36
|
-
@assembly.sort_by! { |x| x.length }
|
37
|
-
end
|
38
40
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
41
|
+
# Return basic statistics for a new Assembly built by loading sequences
|
42
|
+
# from the FASTA-format +:file+
|
43
|
+
def self.stats_from_fasta file
|
44
|
+
a = Assembly.new file
|
45
|
+
a.basic_stats
|
46
|
+
end
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
def run
|
49
|
+
stats = self.basic_stats
|
50
|
+
stats.each_pair do |key, value|
|
51
|
+
ivar = "@#{key.gsub(/ /, '_')}".to_sym
|
52
|
+
self.instance_variable_set(key, value)
|
53
|
+
end
|
51
54
|
end
|
52
|
-
end
|
53
55
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
56
|
+
# Return a hash of statistics about this assembly
|
57
|
+
def basic_stats
|
58
|
+
cumulative_length = 0.0
|
59
|
+
# we'll calculate Nx for all these x
|
60
|
+
x = [90, 70, 50, 30, 10]
|
61
|
+
x2 = x.clone
|
62
|
+
cutoff = x2.pop / 100.0
|
63
|
+
res = []
|
64
|
+
n1k = 0
|
65
|
+
n10k = 0
|
66
|
+
@assembly.each do |s|
|
67
|
+
new_cum_len = cumulative_length + s.length
|
68
|
+
prop = new_cum_len / self.n_bases
|
69
|
+
n1k += 1 if s.length > 1_000
|
70
|
+
n10k += 1 if s.length > 10_000
|
71
|
+
if prop >= cutoff
|
72
|
+
res << s.length
|
73
|
+
break if x2.empty?
|
74
|
+
cutoff = x2.pop / 100.0
|
75
|
+
end
|
76
|
+
cumulative_length = new_cum_len
|
73
77
|
end
|
74
|
-
|
78
|
+
mean = cumulative_length / @assembly.size
|
79
|
+
ns = Hash[x.map { |n| "N#{n}" }.zip(res)]
|
80
|
+
{
|
81
|
+
"n_seqs" => @assembly.size,
|
82
|
+
"smallest" => @assembly.first.length,
|
83
|
+
"largest" => @assembly.last.length,
|
84
|
+
"n_bases" => @n_bases,
|
85
|
+
"mean_len" => mean,
|
86
|
+
"n > 1k" => n1k,
|
87
|
+
"n > 10k" => n10k
|
88
|
+
}.merge ns
|
75
89
|
end
|
76
|
-
mean = cumulative_length / @assembly.size
|
77
|
-
ns = Hash[x.map { |n| "N#{n}" }.zip(res)]
|
78
|
-
{
|
79
|
-
"n_seqs" => @assembly.size,
|
80
|
-
"smallest" => @assembly.first.length,
|
81
|
-
"largest" => @assembly.last.length,
|
82
|
-
"n_bases" => @n_bases,
|
83
|
-
"mean_len" => mean,
|
84
|
-
"n > 1k" => n1k,
|
85
|
-
"n > 10k" => n10k
|
86
|
-
}.merge ns
|
87
|
-
end
|
88
90
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
91
|
+
# return the number of bases in the assembly, calculating
|
92
|
+
# from the assembly if it hasn't already been done.
|
93
|
+
def n_bases
|
94
|
+
unless @n_bases
|
95
|
+
@n_bases = 0
|
96
|
+
@assembly.each { |s| @n_bases += s.length }
|
97
|
+
end
|
98
|
+
@n_bases
|
99
|
+
end
|
100
|
+
|
101
|
+
def print_stats
|
102
|
+
self.basic_stats.map do |k, v|
|
103
|
+
"#{k}#{" " * (20 - (k.length + v.to_i.to_s.length))}#{v.to_i}"
|
104
|
+
end.join("\n")
|
95
105
|
end
|
96
|
-
@n_bases
|
97
|
-
end
|
98
106
|
|
99
|
-
|
100
|
-
self.basic_stats.map do |k, v|
|
101
|
-
"#{k}#{" " * (20 - (k.length + v.to_i.to_s.length))}#{v.to_i}"
|
102
|
-
end.join("\n")
|
103
|
-
end
|
107
|
+
end # Assembly
|
104
108
|
|
105
|
-
end #
|
109
|
+
end # Transrate
|
data/lib/transrate/version.rb
CHANGED