transrate 1.0.0.beta1 → 1.0.0.beta2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +8 -0
- data/CITATION +3 -0
- data/README.md +1 -1
- data/Rakefile +71 -0
- data/bin/transrate +92 -41
- data/deps/blast.yaml +27 -0
- data/deps/deps.yaml +36 -62
- data/ext/transrate/transrate.c +9 -0
- data/lib/transrate/assembly.rb +21 -11
- data/lib/transrate/comparative_metrics.rb +2 -2
- data/lib/transrate/contig.rb +8 -3
- data/lib/transrate/read_metrics.rb +22 -62
- data/lib/transrate/salmon.rb +67 -0
- data/lib/transrate/snap.rb +4 -32
- data/lib/transrate/transrater.rb +1 -1
- data/lib/transrate/version.rb +1 -1
- data/lib/transrate.rb +18 -15
- data/test/data/sorghum_100.fa +200 -0
- data/test/data/test.sf +30 -0
- data/test/helper.rb +13 -0
- data/test/test_assembly.rb +54 -0
- data/test/test_bin.rb +30 -27
- data/test/test_cmd.rb +5 -0
- data/test/test_contig.rb +9 -14
- data/test/test_read_metrics.rb +66 -42
- data/test/test_salmon.rb +33 -0
- data/test/test_snap.rb +27 -0
- data/test/test_transrater.rb +10 -10
- data/transrate.gemspec +1 -1
- metadata +14 -12
- data/lib/transrate/express.rb +0 -102
- data/lib/transrate/sam_checker.rb +0 -74
- data/lib/transrate/samtools.rb +0 -146
- data/test/data/express_results.xprs +0 -5
- data/test/test_express.rb +0 -22
- data/test/test_samtools.rb +0 -22
@@ -11,7 +11,7 @@ module Transrate
|
|
11
11
|
def initialize assembly
|
12
12
|
@assembly = assembly
|
13
13
|
@mapper = Snap.new
|
14
|
-
@
|
14
|
+
@salmon = Salmon.new
|
15
15
|
self.initial_values
|
16
16
|
|
17
17
|
load_executables
|
@@ -19,7 +19,6 @@ module Transrate
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def load_executables
|
22
|
-
@bam_splitter = get_bin_path 'bam-split'
|
23
22
|
@bam_reader = get_bin_path 'bam-read'
|
24
23
|
end
|
25
24
|
|
@@ -27,7 +26,7 @@ module Transrate
|
|
27
26
|
which_bin = Cmd.new("which #{bin}")
|
28
27
|
which_bin.run
|
29
28
|
if !which_bin.status.success?
|
30
|
-
raise
|
29
|
+
raise TransrateIOError.new("ReadMetrics: could not find #{bin} in path")
|
31
30
|
end
|
32
31
|
which_bin.stdout.split("\n").first
|
33
32
|
end
|
@@ -35,10 +34,11 @@ module Transrate
|
|
35
34
|
def run left, right, insertsize:200, insertsd:50, threads:8
|
36
35
|
# check all read files exist
|
37
36
|
[left, right].each do |readfile|
|
38
|
-
raise
|
37
|
+
raise TransrateIOError.new "Read file is nil" if readfile.nil?
|
39
38
|
readfile.split(",").each do |file|
|
40
39
|
unless File.exist? file
|
41
|
-
raise
|
40
|
+
raise TransrateIOError.new "ReadMetrics: read file does not " +
|
41
|
+
"exist: #{file}"
|
42
42
|
end
|
43
43
|
end
|
44
44
|
end
|
@@ -54,38 +54,18 @@ module Transrate
|
|
54
54
|
threads: threads)
|
55
55
|
@fragments = @mapper.read_count
|
56
56
|
|
57
|
-
|
58
|
-
|
59
|
-
assigned_bam = "hits.1.samp.bam"
|
60
|
-
readsorted_bam = "#{File.basename(bamfile, '.bam')}.readsorted.bam"
|
61
|
-
valid_bam = "#{File.basename(bamfile, '.bam')}.valid.bam"
|
62
|
-
invalid_bam = "#{File.basename(bamfile, '.bam')}.invalid.bam"
|
57
|
+
assigned_bam = "postSample.bam"
|
58
|
+
final_bam = "#{File.basename(bamfile, '.bam')}.assigned.bam"
|
63
59
|
|
64
60
|
# check for latest files first and create what is needed
|
65
|
-
if !File.exist?(
|
66
|
-
if !File.exist?(
|
67
|
-
|
68
|
-
if !File.exist?(readsorted_bam)
|
69
|
-
if !File.exist?(valid_bam)
|
70
|
-
valid_bam, invalid_bam = split_bam bamfile
|
71
|
-
end
|
72
|
-
readsorted_bam = Samtools.readsort_bam(valid_bam)
|
73
|
-
File.delete valid_bam
|
74
|
-
end
|
75
|
-
assigned_bam = assign_and_quantify readsorted_bam
|
76
|
-
File.delete readsorted_bam
|
77
|
-
end
|
78
|
-
Samtools.merge_bam(invalid_bam, assigned_bam,
|
79
|
-
merged_bam, threads=threads)
|
80
|
-
File.delete invalid_bam
|
81
|
-
File.delete assigned_bam
|
61
|
+
if !File.exist?(final_bam)
|
62
|
+
if !File.exist?(assigned_bam)
|
63
|
+
assigned_bam = assign_and_quantify bamfile
|
82
64
|
end
|
83
|
-
|
84
|
-
File.delete merged_bam
|
65
|
+
File.rename(assigned_bam, final_bam)
|
85
66
|
end
|
86
|
-
|
87
67
|
# analyse the final mappings
|
88
|
-
analyse_read_mappings(
|
68
|
+
analyse_read_mappings(final_bam, insertsize, insertsd, true)
|
89
69
|
|
90
70
|
@has_run = true
|
91
71
|
end
|
@@ -133,31 +113,12 @@ module Transrate
|
|
133
113
|
read_length
|
134
114
|
end
|
135
115
|
|
136
|
-
def split_bam bamfile
|
137
|
-
base = File.basename(bamfile, '.bam')
|
138
|
-
valid = "#{base}.valid.bam"
|
139
|
-
invalid = "#{base}.invalid.bam"
|
140
|
-
if !File.exist? valid
|
141
|
-
cmd = "#{@bam_splitter} #{bamfile}"
|
142
|
-
splitter = Cmd.new cmd
|
143
|
-
splitter.run
|
144
|
-
if !splitter.status.success?
|
145
|
-
raise StandardError.new "Couldn't split bam file: #{bamfile}" +
|
146
|
-
"\n#{splitter.stdout}\n#{splitter.stderr}"
|
147
|
-
end
|
148
|
-
end
|
149
|
-
if !File.exist? valid
|
150
|
-
raise StandardError.new "Splitting failed to create valid bam: #{valid}"
|
151
|
-
end
|
152
|
-
[valid, invalid]
|
153
|
-
end
|
154
|
-
|
155
116
|
def assign_and_quantify bamfile
|
156
|
-
|
117
|
+
@salmon.run(@assembly, bamfile)
|
157
118
|
end
|
158
119
|
|
159
|
-
def analyse_expression
|
160
|
-
|
120
|
+
def analyse_expression salmon_output
|
121
|
+
salmon_output.each_pair do |name, expr|
|
161
122
|
contig_name = Bio::FastaDefline.new(name.to_s).entry_id
|
162
123
|
contig = @assembly[contig_name]
|
163
124
|
if expr[:eff_len]==0
|
@@ -190,14 +151,14 @@ module Transrate
|
|
190
151
|
end
|
191
152
|
@bad = @fragments_mapped - @good
|
192
153
|
else
|
193
|
-
raise "couldn't find bamfile: #{bamfile}"
|
154
|
+
raise TransrateError.new "couldn't find bamfile: #{bamfile}"
|
194
155
|
end
|
195
|
-
|
156
|
+
salmon_results = "#{File.basename @assembly.file}_quant.sf"
|
196
157
|
|
197
|
-
if File.exist?(
|
198
|
-
analyse_expression(@
|
158
|
+
if File.exist?(salmon_results)
|
159
|
+
analyse_expression(@salmon.load_expression(salmon_results))
|
199
160
|
else
|
200
|
-
abort "Can't find #{
|
161
|
+
abort "Can't find #{salmon_results}"
|
201
162
|
end
|
202
163
|
@assembly.assembly.each_pair do |name, contig|
|
203
164
|
@contigs_good += 1 if contig.score >= 0.5
|
@@ -228,7 +189,7 @@ module Transrate
|
|
228
189
|
if !reader.status.success?
|
229
190
|
msg = "Couldn't get information from bam file: #{bamfile}\n"
|
230
191
|
msg << "#{reader.stdout}\n#{reader.stderr}"
|
231
|
-
raise msg
|
192
|
+
raise TransrateError.new msg
|
232
193
|
end
|
233
194
|
end
|
234
195
|
end
|
@@ -236,8 +197,7 @@ module Transrate
|
|
236
197
|
def populate_contig_data row
|
237
198
|
name = Bio::FastaDefline.new(row[:name].to_s).entry_id
|
238
199
|
contig = @assembly[name]
|
239
|
-
|
240
|
-
contig.p_seq_true = (row[:p_seq_true] - scale) * (1.0 / (1 - scale))
|
200
|
+
contig.p_seq_true = row[:p_seq_true]
|
241
201
|
contig.uncovered_bases = row[:bases_uncovered]
|
242
202
|
@bases_uncovered += contig.uncovered_bases
|
243
203
|
if row[:fragments_mapped] and row[:fragments_mapped] > 1
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Transrate
|
2
|
+
|
3
|
+
class SalmonError < TransrateError
|
4
|
+
end
|
5
|
+
|
6
|
+
class Salmon
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
which = Cmd.new('which salmon')
|
10
|
+
which.run
|
11
|
+
if !which.status.success?
|
12
|
+
raise SalmonError.new("could not find salmon in the path")
|
13
|
+
end
|
14
|
+
@salmon = which.stdout.split("\n").first
|
15
|
+
end
|
16
|
+
|
17
|
+
def run assembly, bamfile, threads=8
|
18
|
+
assembly = assembly.file if assembly.is_a? Assembly
|
19
|
+
output = "quant.sf"
|
20
|
+
@fin_output = "#{File.basename assembly}_#{output}"
|
21
|
+
unless File.exist? @fin_output
|
22
|
+
salmon = Cmd.new build_command(assembly, bamfile, threads)
|
23
|
+
salmon.run
|
24
|
+
unless salmon.status.success?
|
25
|
+
logger.error salmon.stderr
|
26
|
+
raise SalmonError.new("Salmon failed")
|
27
|
+
end
|
28
|
+
File.rename(output, @fin_output)
|
29
|
+
end
|
30
|
+
return 'postSample.bam'
|
31
|
+
end
|
32
|
+
|
33
|
+
def build_command assembly, bamfile, threads=4
|
34
|
+
cmd = "#{@salmon} quant"
|
35
|
+
cmd << " --libType IU"
|
36
|
+
cmd << " --alignments #{bamfile}"
|
37
|
+
cmd << " --targets #{assembly}"
|
38
|
+
cmd << " --threads #{threads}"
|
39
|
+
cmd << " --sampleOut"
|
40
|
+
cmd << " --sampleUnaligned" # thanks Rob!
|
41
|
+
cmd << " --output ."
|
42
|
+
cmd
|
43
|
+
end
|
44
|
+
|
45
|
+
def load_expression file
|
46
|
+
expression = {}
|
47
|
+
File.open(file).each do |line|
|
48
|
+
if line !~ /^#/
|
49
|
+
line = line.chomp.split("\t")
|
50
|
+
target = line[0]
|
51
|
+
effective_length = line[1]
|
52
|
+
effective_count = line[4]
|
53
|
+
tpm = line[2]
|
54
|
+
expression[target] = {
|
55
|
+
:eff_len => effective_length.to_i,
|
56
|
+
:eff_count => effective_count.to_f,
|
57
|
+
:tpm => tpm.to_f
|
58
|
+
}
|
59
|
+
end
|
60
|
+
end
|
61
|
+
expression
|
62
|
+
end
|
63
|
+
|
64
|
+
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
data/lib/transrate/snap.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Transrate
|
2
2
|
|
3
|
-
class SnapError <
|
3
|
+
class SnapError < TransrateError
|
4
4
|
end
|
5
5
|
|
6
6
|
class Snap
|
@@ -8,7 +8,7 @@ module Transrate
|
|
8
8
|
require 'fix-trinity-output'
|
9
9
|
require 'bio'
|
10
10
|
|
11
|
-
attr_reader :index_name, :
|
11
|
+
attr_reader :index_name, :bam, :read_count
|
12
12
|
|
13
13
|
def initialize
|
14
14
|
which_snap = Cmd.new('which snap')
|
@@ -27,10 +27,6 @@ module Transrate
|
|
27
27
|
l.split(",").zip(r.split(",")).each do |left, right|
|
28
28
|
cmd << " #{left} #{right}"
|
29
29
|
end
|
30
|
-
# NOTE: do NOT turn on the -so flag (sort bam output)
|
31
|
-
# it violates the basic assumption of eXpress's streaming
|
32
|
-
# algorithm: that the fragments are observed in approximately
|
33
|
-
# random order.
|
34
30
|
cmd << " -o #{@bam}"
|
35
31
|
cmd << " -s 0 1000" # min and max distance between paired-read starts
|
36
32
|
cmd << " -H 300000" # max seed hits to consider in paired mode
|
@@ -39,7 +35,7 @@ module Transrate
|
|
39
35
|
cmd << " -t #{threads}"
|
40
36
|
cmd << " -b" # bind threads to cores
|
41
37
|
cmd << " -M" # format cigar string
|
42
|
-
cmd << " -D 5" # edit distance to search for
|
38
|
+
cmd << " -D 5" # extra edit distance to search. needed for -om
|
43
39
|
cmd << " -om 5" # Output multiple alignments. extra edit distance
|
44
40
|
cmd << " -omax 10" # max alignments per pair/read
|
45
41
|
cmd
|
@@ -63,6 +59,7 @@ module Transrate
|
|
63
59
|
save_readcount runner.stdout
|
64
60
|
unless runner.status.success?
|
65
61
|
if runner.stderr=~/Unmatched\sread\sIDs/
|
62
|
+
logger.warn runner.stderr
|
66
63
|
logger.warn "Unmatched read IDs. Fixing input files..."
|
67
64
|
remap_reads(left, right, threads)
|
68
65
|
else
|
@@ -134,7 +131,6 @@ module Transrate
|
|
134
131
|
|
135
132
|
def build_index file, threads
|
136
133
|
@index_name = File.basename(file, File.extname(file))
|
137
|
-
file = check_ambiguous(file)
|
138
134
|
unless Dir.exists?(@index_name)
|
139
135
|
cmd = "#{@snap} index #{file} #{@index_name}"
|
140
136
|
cmd << " -s 23"
|
@@ -151,30 +147,6 @@ module Transrate
|
|
151
147
|
@index_built = true
|
152
148
|
end
|
153
149
|
|
154
|
-
def check_ambiguous file
|
155
|
-
|
156
|
-
ref = Bio::FastaFormat.open(file)
|
157
|
-
ambiguous = false
|
158
|
-
fixed = ""
|
159
|
-
ref.each do |entry|
|
160
|
-
seq = entry.seq
|
161
|
-
if seq =~ /[RYSWKMBDHV]/
|
162
|
-
seq = seq.gsub(/[RYSWKMBDHV]/, "N")
|
163
|
-
ambiguous = true
|
164
|
-
end
|
165
|
-
fixed << ">#{entry.definition}\n#{seq}\n"
|
166
|
-
end
|
167
|
-
ref.close
|
168
|
-
if ambiguous
|
169
|
-
logger.warn "squelching ambiguous nucleotides"
|
170
|
-
file = "#{File.basename(file, File.extname(file))}.fixed.fasta"
|
171
|
-
File.open(file, "w") do |out|
|
172
|
-
out.write fixed
|
173
|
-
end
|
174
|
-
end
|
175
|
-
return file
|
176
|
-
end
|
177
|
-
|
178
150
|
end # Snap
|
179
151
|
|
180
152
|
end # Transrate
|
data/lib/transrate/transrater.rb
CHANGED
data/lib/transrate/version.rb
CHANGED
data/lib/transrate.rb
CHANGED
@@ -3,25 +3,15 @@ require 'rbconfig'
|
|
3
3
|
require 'yell'
|
4
4
|
RbConfig::CONFIG['CFLAGS'] = ''
|
5
5
|
|
6
|
-
require 'transrate/transrater'
|
7
|
-
require 'transrate/version'
|
8
|
-
require 'transrate/contig'
|
9
|
-
require 'transrate/assembly'
|
10
|
-
require 'transrate/snap'
|
11
|
-
require 'transrate/score_optimiser'
|
12
|
-
require 'transrate/express'
|
13
|
-
require 'transrate/read_metrics'
|
14
|
-
require 'transrate/comparative_metrics'
|
15
|
-
require 'transrate/contig_metrics'
|
16
|
-
require 'transrate/samtools'
|
17
|
-
require 'transrate/cmd'
|
18
|
-
require 'transrate/sam_checker'
|
19
|
-
require 'transrate/transrate.so'
|
20
|
-
|
21
6
|
# Transrate is a comprehensive transcriptome assembly
|
22
7
|
# quality assessment tool.
|
23
8
|
module Transrate
|
24
9
|
|
10
|
+
# Our own set of errors to allow nice custom error handling
|
11
|
+
class TransrateError < StandardError; end
|
12
|
+
class TransrateIOError < TransrateError; end
|
13
|
+
class TransrateArgError < TransrateError; end
|
14
|
+
|
25
15
|
# Create the universal logger and include it in Object
|
26
16
|
# making the logger object available everywhere
|
27
17
|
format = Yell::Formatter.new("[%5L] %d : %m", "%Y-%m-%d %H:%M:%S")
|
@@ -35,3 +25,16 @@ module Transrate
|
|
35
25
|
Object.send :include, Yell::Loggable
|
36
26
|
|
37
27
|
end # Transrate
|
28
|
+
|
29
|
+
require 'transrate/transrater'
|
30
|
+
require 'transrate/version'
|
31
|
+
require 'transrate/contig'
|
32
|
+
require 'transrate/assembly'
|
33
|
+
require 'transrate/snap'
|
34
|
+
require 'transrate/score_optimiser'
|
35
|
+
require 'transrate/salmon'
|
36
|
+
require 'transrate/read_metrics'
|
37
|
+
require 'transrate/comparative_metrics'
|
38
|
+
require 'transrate/contig_metrics'
|
39
|
+
require 'transrate/cmd'
|
40
|
+
require 'transrate/transrate.so'
|