transfuse 0.1.4 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +42 -1
- data/Rakefile +10 -4
- data/bin/transfuse +12 -11
- data/deps/deps.yaml +11 -0
- data/lib/transfuse/cluster.rb +53 -68
- data/lib/transfuse/cmd.rb +1 -1
- data/lib/transfuse/consensus.rb +105 -0
- data/lib/transfuse/transfuse.rb +108 -88
- data/lib/transfuse/version.rb +2 -2
- data/lib/transfuse.rb +5 -4
- data/test/test_transfuse.rb +38 -40
- data/transfuse.gemspec +1 -1
- metadata +5 -5
- data/Gemfile.lock +0 -87
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 03b84ea2236b7c43ce321f64352bb6cb7b7035d4
|
4
|
+
data.tar.gz: ed70d9a9c204a06b550a7b95c01c74afdcb98b51
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6074f06d8afd9e33fce7b77e0221ffc8a300da966cc7916f86eb245d02c1e2b5519a831e1c99962fcfd5b085b5cd3d94765297ec94469270e4246c6c9ab71830
|
7
|
+
data.tar.gz: d992bac7cb1a98cd523e4df28d5b684f42bf8d8d7deb7882d42ecd8bb50bb137a190200006c7528f834d9e30a214b1f304a46b8502176993e22b019658c19061
|
data/README.md
CHANGED
@@ -1 +1,42 @@
|
|
1
|
-
Transfuse
|
1
|
+
## Transfuse
|
2
|
+
|
3
|
+
**Transfuse is currently in development and is not yet ready for use**
|
4
|
+
|
5
|
+
Transfuse intelligently merges your multiple de novo transcriptome assemblies. Run multiple assemblies with different de novo assemblers, or different settings in the same assembler and have them combined into a single high quality transcriptome.
|
6
|
+
|
7
|
+
Transfuse takes in the reads you used to do the assembly and a list of fasta files and produces a single output fasta file.
|
8
|
+
|
9
|
+
### Installation and Running
|
10
|
+
|
11
|
+
To install Transfuse, clone this repo:
|
12
|
+
|
13
|
+
`git clone https://github.com/cboursnell/transfuse.git`
|
14
|
+
|
15
|
+
Then build and install the Ruby gem:
|
16
|
+
|
17
|
+
`gem build *spec; gem install *gem`
|
18
|
+
|
19
|
+
### Usage
|
20
|
+
|
21
|
+
Transfuse is run on the command line. The options are:
|
22
|
+
|
23
|
+
```
|
24
|
+
-a, --assembly=<s> assembly files in FASTA format, comma-separated
|
25
|
+
-l, --left=<s> left reads file in FASTQ format
|
26
|
+
-r, --right=<s> right reads file in FASTQ format
|
27
|
+
-o, --output=<s> write merged assembly to file
|
28
|
+
-t, --threads=<i> number of threads (default: 1)
|
29
|
+
-v, --verbose be verbose
|
30
|
+
-e, --version Print version and exit
|
31
|
+
-h, --help Show this message
|
32
|
+
```
|
33
|
+
|
34
|
+
An example command:
|
35
|
+
|
36
|
+
`transfuse --assembly soap-k31.fa,soap-k41.fa,soap-k51.fa --left reads_1.fq --right reads_2.fq --output soap-merged.fa --threads 12`
|
37
|
+
|
38
|
+
### License
|
39
|
+
|
40
|
+
This is academic software - please cite us if you use it in your work.
|
41
|
+
|
42
|
+
Transfuse is released under the MIT license.
|
data/Rakefile
CHANGED
@@ -5,15 +5,21 @@ Rake::TestTask.new do |t|
|
|
5
5
|
end
|
6
6
|
|
7
7
|
Rake::TestTask.new do |t|
|
8
|
-
t.name = :
|
8
|
+
t.name = :cluster
|
9
9
|
t.libs << 'test'
|
10
|
-
t.test_files = ['test/
|
10
|
+
t.test_files = ['test/test_cluster.rb']
|
11
11
|
end
|
12
12
|
|
13
13
|
Rake::TestTask.new do |t|
|
14
|
-
t.name = :
|
14
|
+
t.name = :fuse
|
15
15
|
t.libs << 'test'
|
16
|
-
t.test_files = ['test/
|
16
|
+
t.test_files = ['test/test_transfuse.rb']
|
17
|
+
end
|
18
|
+
|
19
|
+
Rake::TestTask.new do |t|
|
20
|
+
t.name = :cons
|
21
|
+
t.libs << 'test'
|
22
|
+
t.test_files = ['test/test_consensus.rb']
|
17
23
|
end
|
18
24
|
|
19
25
|
desc "Run tests"
|
data/bin/transfuse
CHANGED
@@ -22,23 +22,24 @@ opts = Trollop::options do
|
|
22
22
|
OPTIONS:
|
23
23
|
|
24
24
|
EOS
|
25
|
-
opt :
|
25
|
+
opt :assemblies, "assembly files in FASTA format, comma-separated",
|
26
26
|
:type => String, :required => true
|
27
|
-
opt :scores, "transrate contig score output files, comma-separated",
|
28
|
-
:type => String
|
29
27
|
opt :left, "left reads file in FASTQ format",
|
30
28
|
:type => String
|
31
29
|
opt :right, "right reads file in FASTQ format",
|
32
30
|
:type => String
|
31
|
+
opt :scores, "transrate contig score output files, comma-separated. Ignored if reads are provided",
|
32
|
+
:type => String
|
33
33
|
opt :output, "write merged assembly to file",
|
34
34
|
:type => String, :required => :true
|
35
35
|
opt :threads, "number of threads", :type => :int, :default => 1
|
36
|
+
opt :id, "sequence identity to cluster at", :type => :float, :default => 1.0
|
36
37
|
opt :verbose, "be verbose"
|
37
38
|
end
|
38
39
|
|
39
40
|
transfuse = Transfuse::Transfuse.new opts.threads, opts.verbose
|
40
41
|
|
41
|
-
assembly_files = transfuse.check_files opts.
|
42
|
+
assembly_files = transfuse.check_files opts.assemblies
|
42
43
|
score_files = transfuse.check_files opts.score if opts.score
|
43
44
|
left = transfuse.check_files opts.left if opts.left
|
44
45
|
right = transfuse.check_files opts.right if opts.right
|
@@ -54,6 +55,7 @@ else
|
|
54
55
|
abort msg
|
55
56
|
end
|
56
57
|
|
58
|
+
# filter out assemblies with low score
|
57
59
|
assembly_files = transfuse.filter assembly_files, scores
|
58
60
|
|
59
61
|
# concatenate assemblies into one fasta file
|
@@ -62,12 +64,11 @@ cat = transfuse.concatenate assembly_files
|
|
62
64
|
# load fasta sequences from concatenated file into hash
|
63
65
|
transfuse.load_fasta cat
|
64
66
|
|
65
|
-
# cluster using vsearch
|
66
|
-
|
67
|
-
|
68
|
-
transfuse.sequence_alignment clusters
|
69
|
-
# pull out contigs from each cluster based on the scores
|
70
|
-
# best = transfuse.select_contigs clusters, scores
|
67
|
+
# cluster using vsearch
|
68
|
+
msa = transfuse.cluster cat, opts.id
|
71
69
|
|
72
|
-
#
|
70
|
+
# read the msa from vsearch and produce a consensus fasta
|
71
|
+
cons = transfuse.consensus msa, scores, opts.output
|
73
72
|
|
73
|
+
# transrate the consensus output to remove low scoring contigs
|
74
|
+
transfuse.transrate_consensus cons, opts.output, left, right
|
data/deps/deps.yaml
CHANGED
@@ -0,0 +1,11 @@
|
|
1
|
+
vsearch:
|
2
|
+
binaries:
|
3
|
+
- vsearch
|
4
|
+
version:
|
5
|
+
number: '1.1.3'
|
6
|
+
command: 'vsearch --version'
|
7
|
+
url:
|
8
|
+
64bit:
|
9
|
+
linux: https://github.com/torognes/vsearch/releases/download/v1.1.3/vsearch-1.1.3-linux-x86_64
|
10
|
+
macosx: https://github.com/torognes/vsearch/releases/download/v1.1.3/vsearch-1.1.3-osx-x86_64
|
11
|
+
unpack: false
|
data/lib/transfuse/cluster.rb
CHANGED
@@ -5,61 +5,31 @@ module Transfuse
|
|
5
5
|
|
6
6
|
class Cluster
|
7
7
|
|
8
|
-
def initialize threads, verbose
|
9
|
-
@cdhit = Which::which('cd-hit-est').first
|
10
|
-
raise "cd-hit-est was not in the PATH - please install it" unless @cdhit
|
8
|
+
def initialize threads, verbose, id
|
11
9
|
@vsearch = Which::which('vsearch').first
|
12
10
|
raise "vsearch was not in the PATH - please install it" unless @vsearch
|
13
|
-
@id =
|
11
|
+
@id = id.to_s
|
14
12
|
@threads = threads
|
15
13
|
@verbose = verbose
|
16
14
|
end
|
17
15
|
|
18
16
|
def run fasta
|
19
|
-
|
20
|
-
|
21
|
-
output = cd_hit fasta
|
22
|
-
return parse_output output
|
23
|
-
else
|
24
|
-
output = vsearch fasta
|
25
|
-
return parse_vsearch_output output
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
def cd_hit fasta
|
30
|
-
puts "running cd-hit-est" if @verbose
|
31
|
-
output = "#{File.basename(fasta, File.extname(fasta))}_cdhit.fa"
|
32
|
-
cdhit_cmd = generate_cdhit_command fasta, output
|
33
|
-
puts cdhit_cmd if @verbose
|
34
|
-
cluster = Cmd.new cdhit_cmd
|
35
|
-
cluster.run output
|
36
|
-
return "#{output}.clstr"
|
17
|
+
cluster_output, msa_output = vsearch fasta
|
18
|
+
return parse_vsearch_output(cluster_output, msa_output)
|
37
19
|
end
|
38
20
|
|
39
21
|
def vsearch fasta
|
40
|
-
|
41
|
-
cluster_output = "#{fasta}.clust"
|
42
|
-
|
22
|
+
print "running vsearch..." if @verbose
|
23
|
+
cluster_output = "#{File.basename(fasta)}-#{@id}.clust"
|
24
|
+
msa_output = "#{File.basename(fasta)}-#{@id}.aln"
|
25
|
+
vsearch_cmd = generate_vsearch_command fasta, cluster_output, msa_output
|
43
26
|
cluster = Cmd.new vsearch_cmd
|
44
27
|
cluster.run cluster_output
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
def generate_cdhit_command fasta, out
|
49
|
-
#cd-hit-est -i all.fa -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
|
50
|
-
cmd = "#{@cdhit}"
|
51
|
-
cmd << " -i #{fasta}"
|
52
|
-
cmd << " -o #{out}"
|
53
|
-
cmd << " -c #{@id}" # similarity = number of identical bases /
|
54
|
-
# length of shorter sequences
|
55
|
-
cmd << " -T #{@threads}"
|
56
|
-
cmd << " -n 10" # word length - maybe increase??
|
57
|
-
cmd << " -d 100" # output name width
|
58
|
-
cmd << " -g 1" # slower but more accurate mode
|
59
|
-
cmd << " -M 8000" # increase memory
|
28
|
+
puts " Done. Created #{cluster_output}" if @verbose
|
29
|
+
return [cluster_output, msa_output]
|
60
30
|
end
|
61
31
|
|
62
|
-
def generate_vsearch_command fasta, out
|
32
|
+
def generate_vsearch_command fasta, out, msa
|
63
33
|
vsearch = "#{@vsearch}"
|
64
34
|
vsearch << " --cluster_fast #{fasta}"
|
65
35
|
vsearch << " --id #{@id}"
|
@@ -67,45 +37,60 @@ module Transfuse
|
|
67
37
|
vsearch << " --qmask none" # no masking
|
68
38
|
vsearch << " --strand both"
|
69
39
|
vsearch << " --uc #{out}"
|
40
|
+
vsearch << " --msaout #{msa}"
|
70
41
|
vsearch << " --threads #{@threads}"
|
71
42
|
return vsearch
|
72
43
|
end
|
73
44
|
|
74
|
-
def
|
75
|
-
|
76
|
-
cluster_id = 0
|
77
|
-
clusters = {}
|
78
|
-
File.open(cluster_output).each_line do |line|
|
79
|
-
if line =~ />Cluster\ ([0-9]+)/
|
80
|
-
cluster_id = $1.to_i
|
81
|
-
elsif line =~ /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
|
82
|
-
contig_name = $1
|
83
|
-
strand = $2
|
84
|
-
id = $3.to_f
|
85
|
-
clusters[cluster_id] ||= []
|
86
|
-
clusters[cluster_id] << { :name => contig_name, :strand => strand }
|
87
|
-
elsif line =~ /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
|
88
|
-
contig_name = $1
|
89
|
-
strand = "+"
|
90
|
-
clusters[cluster_id] ||= []
|
91
|
-
clusters[cluster_id] << { :name => contig_name, :strand => strand }
|
92
|
-
end
|
93
|
-
end
|
94
|
-
return clusters
|
95
|
-
end
|
96
|
-
|
97
|
-
def parse_vsearch_output cluster_output
|
45
|
+
def parse_vsearch_output cluster_output, msa_output
|
46
|
+
print "parsing vsearch output" if @verbose
|
98
47
|
clusters = {}
|
48
|
+
lookup = {}
|
49
|
+
second = 0
|
50
|
+
count = 0
|
99
51
|
File.open(cluster_output).each_line do |line|
|
52
|
+
count+=1
|
100
53
|
if line.start_with?("S") or line.start_with?("H")
|
101
54
|
cols = line.chomp.split("\t")
|
102
|
-
cluster = cols[1]
|
55
|
+
cluster = cols[1]
|
56
|
+
len = cols[2].to_i
|
57
|
+
cigar = cols[7]
|
58
|
+
strand = cols[4]
|
59
|
+
strand = "+" if strand == "*"
|
103
60
|
contig_name = cols[8]
|
61
|
+
|
104
62
|
clusters[cluster] ||= []
|
105
|
-
clusters[cluster] << contig_name
|
63
|
+
clusters[cluster] << { :name => contig_name, :strand => strand }
|
64
|
+
lookup[contig_name] = cluster
|
65
|
+
end
|
66
|
+
if count%10_000==0 and @verbose
|
67
|
+
print "."
|
106
68
|
end
|
107
69
|
end
|
108
|
-
|
70
|
+
puts " Done" if @verbose
|
71
|
+
print "parsing msa output " if @verbose
|
72
|
+
count = 0
|
73
|
+
msa = {}
|
74
|
+
Bio::FastaFormat.open(msa_output).each do |entry|
|
75
|
+
count += 1
|
76
|
+
name = entry.entry_id
|
77
|
+
if name != "consensus"
|
78
|
+
# name = name[1..-1]
|
79
|
+
if name[0]=="*"
|
80
|
+
name = name[1..-1]
|
81
|
+
end
|
82
|
+
# what cluster is name in?
|
83
|
+
cluster = lookup[name]
|
84
|
+
msa[cluster] ||= []
|
85
|
+
msa[cluster] << { :name => name, :seq => entry.seq.seq }
|
86
|
+
end
|
87
|
+
if count%10_000==0 and @verbose
|
88
|
+
print "."
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
92
|
+
puts " Done" if @verbose
|
93
|
+
return msa
|
109
94
|
end
|
110
95
|
|
111
96
|
end
|
data/lib/transfuse/cmd.rb
CHANGED
@@ -0,0 +1,105 @@
|
|
1
|
+
|
2
|
+
require 'bio'
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
module Transfuse
|
6
|
+
|
7
|
+
class Consensus
|
8
|
+
|
9
|
+
attr_reader :contigs
|
10
|
+
|
11
|
+
def initialize verbose
|
12
|
+
@verbose = verbose
|
13
|
+
end
|
14
|
+
|
15
|
+
def run msa, scores, output
|
16
|
+
return 1 if File.exist?(output)
|
17
|
+
print "writing consensus " if @verbose
|
18
|
+
# msa is a hash
|
19
|
+
# key = cluster id
|
20
|
+
# value = list
|
21
|
+
# list of sequences in cluster aligned with gaps
|
22
|
+
preoutput = "#{File.basename(output, File.extname(output))}_cons.fa"
|
23
|
+
count = 0
|
24
|
+
File.open("#{output}.data", "w") do |out2|
|
25
|
+
File.open(preoutput, "w") do |out|
|
26
|
+
msa.each do |id, list|
|
27
|
+
count+=1
|
28
|
+
print "." if count%5_000==0 and @verbose
|
29
|
+
exons={}
|
30
|
+
cons = []
|
31
|
+
length = list[0][:seq].length
|
32
|
+
list.each_with_index do |hash, index|
|
33
|
+
seq = hash[:seq]
|
34
|
+
name = hash[:name]
|
35
|
+
out2.write "#{id}\t#{scores[name][:score]}\t#{name}\n"
|
36
|
+
prev = ""
|
37
|
+
gap = 0
|
38
|
+
exon = 0
|
39
|
+
seq.each_char do |c|
|
40
|
+
if c=="-"
|
41
|
+
base="-"
|
42
|
+
else
|
43
|
+
base="*"
|
44
|
+
end
|
45
|
+
if base!=prev
|
46
|
+
if c=="-"
|
47
|
+
gap+=1
|
48
|
+
else
|
49
|
+
exon+=1
|
50
|
+
end
|
51
|
+
end
|
52
|
+
if c=="-"
|
53
|
+
prev = "-"
|
54
|
+
else
|
55
|
+
prev = "*"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
exons[index] = exon
|
59
|
+
end
|
60
|
+
|
61
|
+
consensus = ""
|
62
|
+
0.upto(length-1) do |i|
|
63
|
+
base="N"
|
64
|
+
counts = {}
|
65
|
+
list.each_with_index do |hash, index|
|
66
|
+
seq = hash[:seq]
|
67
|
+
if seq[i] != "-" and seq[i] != "N"
|
68
|
+
counts[seq[i]]||=0
|
69
|
+
counts[seq[i]] += 1
|
70
|
+
if exons[index]==1
|
71
|
+
base = seq[i]
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
if counts.size>0
|
76
|
+
base = counts.sort.last.first
|
77
|
+
end
|
78
|
+
consensus << base
|
79
|
+
end
|
80
|
+
|
81
|
+
if consensus.count("N") < consensus.length.to_f*0.5
|
82
|
+
cons << consensus
|
83
|
+
end
|
84
|
+
|
85
|
+
list.each_with_index do |hash, index|
|
86
|
+
if exons[index] > 1
|
87
|
+
cons << hash[:seq].delete("-")
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
cons.each_with_index do |s,index|
|
92
|
+
out.write ">contig#{id}.#{index+1}\n"
|
93
|
+
out.write "#{s}\n"
|
94
|
+
end
|
95
|
+
|
96
|
+
end # msa.each
|
97
|
+
end # file
|
98
|
+
end # file open
|
99
|
+
puts " Done" if @verbose
|
100
|
+
return preoutput
|
101
|
+
end # def
|
102
|
+
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
data/lib/transfuse/transfuse.rb
CHANGED
@@ -6,27 +6,28 @@ end
|
|
6
6
|
|
7
7
|
module Transfuse
|
8
8
|
|
9
|
+
require 'bio'
|
9
10
|
require 'csv'
|
10
11
|
require 'transrate'
|
12
|
+
require 'threach'
|
11
13
|
|
12
14
|
class Transfuse
|
13
15
|
|
14
16
|
def initialize threads, verbose
|
15
17
|
@threads = threads
|
16
18
|
@verbose = verbose
|
17
|
-
@clustalo = Which::which('clustalo').first
|
18
|
-
raise "clustalo was not in the PATH - please install it" unless @clustalo
|
19
19
|
end
|
20
20
|
|
21
21
|
def check_files string
|
22
|
+
# puts "check file string: #{string}" if @verbose
|
22
23
|
list = []
|
23
24
|
string.split(",").each do |file|
|
24
25
|
file = File.expand_path(file)
|
25
26
|
if File.exist?(file)
|
26
|
-
puts "#{file} exists" if @verbose
|
27
|
+
puts "#{File.basename(file)} exists" if @verbose
|
27
28
|
list << file
|
28
29
|
else
|
29
|
-
abort "#{file} not found"
|
30
|
+
abort "#{File.basename(file)} not found"
|
30
31
|
end
|
31
32
|
end
|
32
33
|
return list
|
@@ -51,42 +52,27 @@ module Transfuse
|
|
51
52
|
return File.expand_path(catted_fasta)
|
52
53
|
end
|
53
54
|
|
54
|
-
def cluster file
|
55
|
-
puts "clustering #{file}" if @verbose
|
56
|
-
cluster = Cluster.new @threads, @verbose
|
57
|
-
return cluster.run file
|
58
|
-
end
|
59
|
-
|
60
55
|
def load_fasta fasta
|
56
|
+
print "loading fasta sequence #{fasta}..." if @verbose
|
61
57
|
@sequences = {}
|
58
|
+
count = 1
|
62
59
|
Bio::FastaFormat.open(fasta).each do |entry|
|
63
60
|
@sequences[entry.entry_id] = entry.seq.to_s
|
61
|
+
print "." if count%10_000==0 and @verbose
|
62
|
+
count +=1
|
64
63
|
end
|
64
|
+
puts " Done" if @verbose
|
65
65
|
end
|
66
66
|
|
67
|
-
def
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
seq << "#{@sequences[hash[:name]].revcomp}\n"
|
77
|
-
else
|
78
|
-
abort "Unknown strand #{hash[:strand]}"
|
79
|
-
end
|
80
|
-
end
|
81
|
-
cmd = "echo -e \"#{seq}\" | #{@clustalo} -i - --outfmt fa "
|
82
|
-
cmd << "--output-order tree-order"
|
83
|
-
align = Cmd.new cmd
|
84
|
-
align.run
|
85
|
-
File.open("cluster#{id}.fa", "wb") do |out|
|
86
|
-
out.write align.stdout
|
87
|
-
end
|
88
|
-
end
|
89
|
-
end
|
67
|
+
def cluster file, id
|
68
|
+
puts "clustering #{file}" if @verbose
|
69
|
+
cluster = Cluster.new @threads, @verbose, id
|
70
|
+
return cluster.run file
|
71
|
+
end
|
72
|
+
|
73
|
+
def consensus msa, scores, output
|
74
|
+
cons = Consensus.new(@verbose)
|
75
|
+
return cons.run(msa, scores, output)
|
90
76
|
end
|
91
77
|
|
92
78
|
def load_scores files
|
@@ -96,8 +82,10 @@ module Transfuse
|
|
96
82
|
:header_converters => :symbol,
|
97
83
|
:converters => :all) do |row|
|
98
84
|
name = row[:contig_name]
|
99
|
-
|
100
|
-
|
85
|
+
scores[name] = { :score => row[:score].to_f,
|
86
|
+
:p_good => row[:p_good].to_f,
|
87
|
+
:p_bases_covered => row[:p_bases_covered].to_f,
|
88
|
+
:coverage => row[:coverage].to_f }
|
101
89
|
end
|
102
90
|
end
|
103
91
|
return scores
|
@@ -107,13 +95,15 @@ module Transfuse
|
|
107
95
|
filtered_files = []
|
108
96
|
files.each_with_index do |file, index|
|
109
97
|
new_filename = "#{File.basename(file, File.extname(file))}_filtered.fa"
|
110
|
-
|
98
|
+
if !File.exist?(new_filename) or File.stat(new_filename).size < 1
|
111
99
|
File.open(new_filename, "wb") do |out|
|
112
|
-
puts "
|
100
|
+
puts "filtering #{file}..." if @verbose
|
113
101
|
Bio::FastaFormat.open(file).each do |entry|
|
114
102
|
contig_name = entry.entry_id
|
115
103
|
contig_name = "contig#{index}_#{contig_name}"
|
116
|
-
if scores.key?(contig_name) and
|
104
|
+
if scores.key?(contig_name) and
|
105
|
+
scores[contig_name][:score] > 0.01 and
|
106
|
+
scores[contig_name][:coverage] >= 1
|
117
107
|
out.write ">#{contig_name}\n"
|
118
108
|
out.write "#{entry.seq}\n"
|
119
109
|
elsif !scores.key?(contig_name)
|
@@ -127,75 +117,105 @@ module Transfuse
|
|
127
117
|
return filtered_files
|
128
118
|
end
|
129
119
|
|
120
|
+
def transrate_consensus file, output, left, right
|
121
|
+
output = File.expand_path(output)
|
122
|
+
puts "transrate on #{file}" if @verbose
|
123
|
+
file = File.expand_path(file)
|
124
|
+
name = File.basename(file, File.extname(file))
|
125
|
+
dir = "transrate_#{name}"
|
126
|
+
Dir.mkdir(dir) unless Dir.exist?(dir)
|
127
|
+
Dir.chdir(dir) do
|
128
|
+
assembly = Transrate::Assembly.new(file)
|
129
|
+
transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
|
130
|
+
rename = "assembly_#{name}_score_optimisation.csv"
|
131
|
+
rm = transrater.read_metrics(left.join(','), right.join(','))
|
132
|
+
stats = rm.read_stats
|
133
|
+
File.rename("assembly_score_optimisation.csv", rename)
|
134
|
+
scores={}
|
135
|
+
assembly.each do |name, contig|
|
136
|
+
scores[name] = { :score => contig.score.to_f,
|
137
|
+
:p_good => contig.p_good.to_f,
|
138
|
+
:p_bases_covered => contig.p_bases_covered.to_f,
|
139
|
+
:coverage => contig.coverage.to_f }
|
140
|
+
end
|
141
|
+
scores_file = "#{name}_scores.csv"
|
142
|
+
stats_file = "../#{name}_stats.txt"
|
143
|
+
puts " writing scores" if @verbose
|
144
|
+
File.open(scores_file, "wb") do |out|
|
145
|
+
scores.each do |name, hash|
|
146
|
+
out.write "#{name}\t#{hash[:score]}\t#{hash[:p_good]}\t"
|
147
|
+
out.write "#{hash[:p_bases_covered]}\t#{hash[:coverage]}\n"
|
148
|
+
end
|
149
|
+
end
|
150
|
+
puts " writing filtered fasta file" if @verbose
|
151
|
+
File.open(output, "wb") do |out|
|
152
|
+
assembly.each do |name, contig|
|
153
|
+
if contig.score.to_f > 0.01 and contig.coverage.to_f >= 1
|
154
|
+
out.write ">#{name}\n"
|
155
|
+
out.write "#{contig.seq.seq}\n"
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
puts " writing stats" if @verbose
|
160
|
+
File.open(stats_file, "wb") do |out|
|
161
|
+
stats.each do |key, value|
|
162
|
+
out.write "#{key}\t#{value}\n"
|
163
|
+
end
|
164
|
+
out.write "assembly score:\t#{transrater.assembly_score}\n"
|
165
|
+
optimal = transrater.assembly_optimal_score("prefix")
|
166
|
+
out.write "optimal score :\t#{optimal[0]}\n"
|
167
|
+
out.write "cutoff :\t#{optimal[1]}\n"
|
168
|
+
end
|
169
|
+
end
|
170
|
+
end
|
130
171
|
|
131
172
|
def transrate files, left, right
|
132
173
|
scores = {}
|
133
|
-
|
174
|
+
shortname = ""
|
175
|
+
files.each do |n|
|
176
|
+
shortname << File.basename(n, File.extname(n))[0..4]
|
177
|
+
end
|
178
|
+
scores_file = "#{shortname}_scores.csv"
|
134
179
|
if File.exist?(scores_file)
|
135
180
|
puts "loading scores from file" if @verbose
|
136
181
|
File.open(scores_file).each do |line|
|
137
|
-
name, score = line.chomp.split("\t")
|
138
|
-
scores[name] = score.to_f
|
182
|
+
name, score, p_good, p_bases_covered, coverage = line.chomp.split("\t")
|
183
|
+
scores[name] = { :score => score.to_f,
|
184
|
+
:p_good => p_good.to_f,
|
185
|
+
:p_bases_covered => p_bases_covered.to_f,
|
186
|
+
:coverage => coverage.to_f }
|
139
187
|
end
|
140
188
|
else
|
141
189
|
files.each_with_index do |fasta, index|
|
142
190
|
puts "transrate on #{fasta}" if @verbose
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
191
|
+
dir = "transrate_#{File.basename(fasta, File.extname(fasta))}"
|
192
|
+
Dir.mkdir(dir) unless Dir.exist?(dir)
|
193
|
+
Dir.chdir(dir) do
|
194
|
+
assembly = Transrate::Assembly.new(fasta)
|
195
|
+
transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
|
196
|
+
rename = "assembly#{index}_score_optimisation.csv"
|
197
|
+
transrater.read_metrics(left.join(','), right.join(','))
|
198
|
+
File.rename("assembly_score_optimisation.csv", rename)
|
199
|
+
assembly.each do |name, contig|
|
200
|
+
name = "contig#{index}_#{name}"
|
201
|
+
scores[name] = { :score => contig.score.to_f,
|
202
|
+
:p_good => contig.p_good.to_f,
|
203
|
+
:p_bases_covered => contig.p_bases_covered.to_f,
|
204
|
+
:coverage => contig.coverage.to_f }
|
205
|
+
|
206
|
+
end
|
149
207
|
end
|
150
208
|
end
|
151
209
|
File.open(scores_file, "wb") do |out|
|
152
|
-
scores.each do |name,
|
153
|
-
out.write "#{name}\t#{score}\
|
210
|
+
scores.each do |name, hash|
|
211
|
+
out.write "#{name}\t#{hash[:score]}\t#{hash[:p_good]}\t"
|
212
|
+
out.write "#{hash[:p_bases_covered]}\t#{hash[:coverage]}\n"
|
154
213
|
end
|
155
214
|
end
|
156
215
|
end
|
157
216
|
return scores
|
158
217
|
end
|
159
218
|
|
160
|
-
def select_contigs clusters, scores
|
161
|
-
puts "selecting contigs" if @verbose
|
162
|
-
best = []
|
163
|
-
clusters.each do |cluster_id, list|
|
164
|
-
best_score = 0
|
165
|
-
best_contig = ""
|
166
|
-
list.each do |contig_name|
|
167
|
-
unless scores[contig_name]
|
168
|
-
abort "can't find #{contig_name} in scores hash\n"
|
169
|
-
end
|
170
|
-
if scores[contig_name] > best_score
|
171
|
-
best_score = scores[contig_name]
|
172
|
-
best_contig = contig_name
|
173
|
-
end
|
174
|
-
end
|
175
|
-
best << best_contig
|
176
|
-
end
|
177
|
-
return best
|
178
|
-
end
|
179
|
-
|
180
|
-
def output_contigs best, fasta, output
|
181
|
-
puts "writing contigs" if @verbose
|
182
|
-
# read in catted fasta sequences
|
183
|
-
sequences = {}
|
184
|
-
Bio::FastaFormat.open(fasta).each do |entry|
|
185
|
-
sequences[entry.entry_id] = entry.seq
|
186
|
-
end
|
187
|
-
File.open(output, "wb") do |out|
|
188
|
-
best.each do |contig_name|
|
189
|
-
if sequences.key?(contig_name)
|
190
|
-
out.write ">#{contig_name}\n"
|
191
|
-
out.write "#{sequences[contig_name]}\n"
|
192
|
-
else
|
193
|
-
puts "can't find #{contig_name} in #{fasta}"
|
194
|
-
end
|
195
|
-
end
|
196
|
-
end
|
197
|
-
end
|
198
|
-
|
199
219
|
end
|
200
220
|
|
201
221
|
end
|
data/lib/transfuse/version.rb
CHANGED
data/lib/transfuse.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
require 'transfuse/cluster
|
2
|
-
require 'transfuse/cmd
|
3
|
-
require 'transfuse/
|
4
|
-
require 'transfuse/
|
1
|
+
require 'transfuse/cluster'
|
2
|
+
require 'transfuse/cmd'
|
3
|
+
require 'transfuse/consensus'
|
4
|
+
require 'transfuse/transfuse'
|
5
|
+
require 'transfuse/version'
|
data/test/test_transfuse.rb
CHANGED
@@ -8,13 +8,13 @@ class TestTransfuse < Test::Unit::TestCase
|
|
8
8
|
context 'transfuse' do
|
9
9
|
|
10
10
|
setup do
|
11
|
-
@fuser = Transfuse::Transfuse.new 4
|
11
|
+
@fuser = Transfuse::Transfuse.new 4, true
|
12
12
|
end
|
13
13
|
|
14
14
|
teardown do
|
15
15
|
end
|
16
16
|
|
17
|
-
should 'check for existence of files' do
|
17
|
+
should '1 check for existence of files' do
|
18
18
|
list = []
|
19
19
|
list << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
|
20
20
|
list << File.join(File.dirname(__FILE__), 'data', 'assembly2.fasta')
|
@@ -22,7 +22,7 @@ class TestTransfuse < Test::Unit::TestCase
|
|
22
22
|
assert_equal 2, files.length, "length"
|
23
23
|
end
|
24
24
|
|
25
|
-
should "concatenate two files" do
|
25
|
+
should "2 concatenate two files" do
|
26
26
|
list = []
|
27
27
|
list << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
|
28
28
|
list << File.join(File.dirname(__FILE__), 'data', 'assembly2.fasta')
|
@@ -36,71 +36,69 @@ class TestTransfuse < Test::Unit::TestCase
|
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
39
|
-
should "cluster fasta file" do
|
40
|
-
Dir.mktmpdir do |tmpdir|
|
39
|
+
should "3 cluster fasta file" do
|
40
|
+
# Dir.mktmpdir do |tmpdir|
|
41
|
+
tmpdir = Dir.mktmpdir
|
41
42
|
Dir.chdir(tmpdir) do
|
42
43
|
file = File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
|
43
44
|
hash = @fuser.cluster file
|
44
45
|
assert_equal 250, hash.size, "output size"
|
45
46
|
end
|
46
|
-
end
|
47
|
+
# end
|
47
48
|
end
|
48
49
|
|
49
|
-
should "load scores from transrate output" do
|
50
|
+
should "4 load scores from transrate output" do
|
50
51
|
files = []
|
51
52
|
files << File.join(File.dirname(__FILE__), 'data', 'contig_scores1.csv')
|
52
53
|
hash = @fuser.load_scores files
|
53
54
|
assert_equal 99, hash.size
|
54
55
|
end
|
55
56
|
|
56
|
-
should "
|
57
|
+
should "5 run transrate on assembly files with reads" do
|
57
58
|
files = []
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
59
|
+
left = []
|
60
|
+
right = []
|
61
|
+
files << File.join(File.dirname(__FILE__), 'data', 'assembly3.fasta')
|
62
|
+
left << File.join(File.dirname(__FILE__), 'data', 'left.fq')
|
63
|
+
right << File.join(File.dirname(__FILE__), 'data', 'right.fq')
|
64
|
+
# Dir.mktmpdir do |tmpdir|
|
65
|
+
tmpdir = Dir.mktmpdir
|
66
|
+
Dir.chdir(tmpdir) do
|
67
|
+
scores = @fuser.transrate files, left, right
|
68
|
+
assert_equal 100, scores.size, "scores size"
|
69
|
+
end
|
70
|
+
# end
|
67
71
|
end
|
68
72
|
|
69
|
-
should "
|
73
|
+
should "6 filter contigs" do
|
70
74
|
files = []
|
71
75
|
left = []
|
72
76
|
right = []
|
73
|
-
files << File.join(File.dirname(__FILE__), 'data', '
|
77
|
+
files << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
|
74
78
|
left << File.join(File.dirname(__FILE__), 'data', 'left.fq')
|
75
79
|
right << File.join(File.dirname(__FILE__), 'data', 'right.fq')
|
76
|
-
Dir.mktmpdir do |tmpdir|
|
80
|
+
# Dir.mktmpdir do |tmpdir|
|
81
|
+
tmpdir = Dir.mktmpdir
|
77
82
|
Dir.chdir(tmpdir) do
|
78
83
|
scores = @fuser.transrate files, left, right
|
79
|
-
|
84
|
+
scores.each do |contig, score|
|
85
|
+
# puts "#{contig}\t#{score}"
|
86
|
+
end
|
87
|
+
new_list = @fuser.filter files, scores
|
88
|
+
assert_equal 1, new_list.length
|
89
|
+
cmd = "grep -c \">\" #{new_list.first}"
|
90
|
+
assert_equal 1, `#{cmd}`.chomp.split.first.to_i, "number of contigs"
|
80
91
|
end
|
81
|
-
end
|
92
|
+
# end
|
93
|
+
|
82
94
|
end
|
83
95
|
|
84
|
-
should "
|
85
|
-
|
86
|
-
scores = { "contig1" => 0.2,
|
87
|
-
"contig2" => 0.3,
|
88
|
-
"contig3" => 0.4,
|
89
|
-
"contig4" => 0.2 }
|
90
|
-
best = @fuser.select_contigs clusters, scores
|
91
|
-
assert_equal 2, best.size
|
92
|
-
assert_equal "contig2", best[0]
|
93
|
-
assert_equal "contig3", best[1]
|
96
|
+
should "7 get consensus of clusters" do
|
97
|
+
|
94
98
|
end
|
95
99
|
|
96
|
-
should "
|
97
|
-
|
98
|
-
file = File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
|
99
|
-
Dir.mktmpdir do |tmpdir|
|
100
|
-
Dir.chdir(tmpdir) do
|
101
|
-
@fuser.output_contigs best, file, "out"
|
102
|
-
end
|
103
|
-
end
|
100
|
+
should "8 not fail when there are duplicated kmers in the input sequences" do
|
101
|
+
|
104
102
|
end
|
105
103
|
|
106
104
|
end
|
data/transfuse.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |gem|
|
|
19
19
|
gem.add_dependency 'bio', '~> 1.4', '>= 1.4.3'
|
20
20
|
gem.add_dependency 'fixwhich', '~> 1.0', '>= 1.0.2'
|
21
21
|
gem.add_dependency 'bindeps', '~> 1.0', '>= 1.0.1'
|
22
|
-
gem.add_dependency 'transrate', '~> 1.0', '>= 1.0.
|
22
|
+
gem.add_dependency 'transrate', '~> 1.0', '>= 1.0.1'
|
23
23
|
|
24
24
|
gem.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
|
25
25
|
gem.add_development_dependency 'turn', '~> 0.9', '>= 0.9.7'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: transfuse
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Richard Smith-Unna
|
@@ -94,7 +94,7 @@ dependencies:
|
|
94
94
|
version: '1.0'
|
95
95
|
- - ">="
|
96
96
|
- !ruby/object:Gem::Version
|
97
|
-
version: 1.0.
|
97
|
+
version: 1.0.1
|
98
98
|
type: :runtime
|
99
99
|
prerelease: false
|
100
100
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -104,7 +104,7 @@ dependencies:
|
|
104
104
|
version: '1.0'
|
105
105
|
- - ">="
|
106
106
|
- !ruby/object:Gem::Version
|
107
|
-
version: 1.0.
|
107
|
+
version: 1.0.1
|
108
108
|
- !ruby/object:Gem::Dependency
|
109
109
|
name: rake
|
110
110
|
requirement: !ruby/object:Gem::Requirement
|
@@ -210,7 +210,6 @@ extra_rdoc_files: []
|
|
210
210
|
files:
|
211
211
|
- ".gitignore"
|
212
212
|
- Gemfile
|
213
|
-
- Gemfile.lock
|
214
213
|
- README.md
|
215
214
|
- Rakefile
|
216
215
|
- bin/transfuse
|
@@ -218,6 +217,7 @@ files:
|
|
218
217
|
- lib/transfuse.rb
|
219
218
|
- lib/transfuse/cluster.rb
|
220
219
|
- lib/transfuse/cmd.rb
|
220
|
+
- lib/transfuse/consensus.rb
|
221
221
|
- lib/transfuse/transfuse.rb
|
222
222
|
- lib/transfuse/version.rb
|
223
223
|
- notes.md
|
@@ -248,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
248
248
|
version: '0'
|
249
249
|
requirements: []
|
250
250
|
rubyforge_project:
|
251
|
-
rubygems_version: 2.
|
251
|
+
rubygems_version: 2.2.2
|
252
252
|
signing_key:
|
253
253
|
specification_version: 4
|
254
254
|
summary: Merge assemblies
|
data/Gemfile.lock
DELETED
@@ -1,87 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
transfuse (0.1.1)
|
5
|
-
bindeps (~> 1.0, >= 1.0.1)
|
6
|
-
bio (~> 1.4, >= 1.4.3)
|
7
|
-
fixwhich (~> 1.0, >= 1.0.2)
|
8
|
-
transrate (= 1.0.0.beta3)
|
9
|
-
trollop (~> 2.0)
|
10
|
-
|
11
|
-
GEM
|
12
|
-
remote: https://rubygems.org/
|
13
|
-
specs:
|
14
|
-
ansi (1.5.0)
|
15
|
-
bindeps (1.1.2)
|
16
|
-
fixwhich (~> 1.0, >= 1.0.2)
|
17
|
-
bio (1.4.3.0001)
|
18
|
-
coveralls (0.8.1)
|
19
|
-
json (~> 1.8)
|
20
|
-
rest-client (>= 1.6.8, < 2)
|
21
|
-
simplecov (~> 0.10.0)
|
22
|
-
term-ansicolor (~> 1.3)
|
23
|
-
thor (~> 0.19.1)
|
24
|
-
crb-blast (0.6.4)
|
25
|
-
bindeps (~> 1.0, >= 1.0.3)
|
26
|
-
bio (~> 1.4, >= 1.4.3)
|
27
|
-
fixwhich (~> 1.0, >= 1.0.2)
|
28
|
-
threach (~> 0.2, >= 0.2.0)
|
29
|
-
trollop (~> 2.0)
|
30
|
-
docile (1.1.5)
|
31
|
-
domain_name (0.5.24)
|
32
|
-
unf (>= 0.0.5, < 1.0.0)
|
33
|
-
facade (1.0.6)
|
34
|
-
fix-trinity-output (1.0.0)
|
35
|
-
trollop (~> 2.0)
|
36
|
-
fixwhich (1.0.2)
|
37
|
-
pathname2 (~> 1.4, >= 1.4.4)
|
38
|
-
http-cookie (1.0.2)
|
39
|
-
domain_name (~> 0.5)
|
40
|
-
json (1.8.3)
|
41
|
-
mime-types (2.6.1)
|
42
|
-
minitest (4.7.5)
|
43
|
-
netrc (0.10.3)
|
44
|
-
pathname2 (1.7.3)
|
45
|
-
facade
|
46
|
-
rake (10.4.2)
|
47
|
-
rest-client (1.8.0)
|
48
|
-
http-cookie (>= 1.0.2, < 2.0)
|
49
|
-
mime-types (>= 1.16, < 3.0)
|
50
|
-
netrc (~> 0.7)
|
51
|
-
shoulda-context (1.2.1)
|
52
|
-
simplecov (0.10.0)
|
53
|
-
docile (~> 1.1.0)
|
54
|
-
json (~> 1.8)
|
55
|
-
simplecov-html (~> 0.10.0)
|
56
|
-
simplecov-html (0.10.0)
|
57
|
-
term-ansicolor (1.3.0)
|
58
|
-
tins (~> 1.0)
|
59
|
-
thor (0.19.1)
|
60
|
-
threach (0.2.0)
|
61
|
-
tins (1.5.2)
|
62
|
-
transrate (1.0.0.beta3)
|
63
|
-
bindeps (~> 1.1, >= 1.1.2)
|
64
|
-
bio (~> 1.4, >= 1.4.3)
|
65
|
-
crb-blast (~> 0.5, >= 0.5.0)
|
66
|
-
fix-trinity-output (~> 1.0, >= 1.0)
|
67
|
-
trollop (~> 2.0, >= 2.0.0)
|
68
|
-
yell (~> 2.0, >= 2.0.4)
|
69
|
-
trollop (2.1.1)
|
70
|
-
turn (0.9.7)
|
71
|
-
ansi
|
72
|
-
minitest (~> 4)
|
73
|
-
unf (0.1.4)
|
74
|
-
unf_ext
|
75
|
-
unf_ext (0.0.7.1)
|
76
|
-
yell (2.0.5)
|
77
|
-
|
78
|
-
PLATFORMS
|
79
|
-
ruby
|
80
|
-
|
81
|
-
DEPENDENCIES
|
82
|
-
coveralls (~> 0.7)
|
83
|
-
rake (~> 10.3, >= 10.3.2)
|
84
|
-
shoulda-context (~> 1.2, >= 1.2.1)
|
85
|
-
simplecov (~> 0.8, >= 0.8.2)
|
86
|
-
transfuse!
|
87
|
-
turn (~> 0.9, >= 0.9.7)
|