transfuse 0.1.4 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -1
- data/Rakefile +10 -4
- data/bin/transfuse +12 -11
- data/deps/deps.yaml +11 -0
- data/lib/transfuse/cluster.rb +53 -68
- data/lib/transfuse/cmd.rb +1 -1
- data/lib/transfuse/consensus.rb +105 -0
- data/lib/transfuse/transfuse.rb +108 -88
- data/lib/transfuse/version.rb +2 -2
- data/lib/transfuse.rb +5 -4
- data/test/test_transfuse.rb +38 -40
- data/transfuse.gemspec +1 -1
- metadata +5 -5
- data/Gemfile.lock +0 -87
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 03b84ea2236b7c43ce321f64352bb6cb7b7035d4
|
4
|
+
data.tar.gz: ed70d9a9c204a06b550a7b95c01c74afdcb98b51
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6074f06d8afd9e33fce7b77e0221ffc8a300da966cc7916f86eb245d02c1e2b5519a831e1c99962fcfd5b085b5cd3d94765297ec94469270e4246c6c9ab71830
|
7
|
+
data.tar.gz: d992bac7cb1a98cd523e4df28d5b684f42bf8d8d7deb7882d42ecd8bb50bb137a190200006c7528f834d9e30a214b1f304a46b8502176993e22b019658c19061
|
data/README.md
CHANGED
@@ -1 +1,42 @@
|
|
1
|
-
Transfuse
|
1
|
+
## Transfuse
|
2
|
+
|
3
|
+
**Transfuse is currently in development and is not yet ready for use**
|
4
|
+
|
5
|
+
Transfuse intelligently merges your multiple de novo transcriptome assemblies. Run multiple assemblies with different de novo assemblers, or with different settings in the same assembler, and have them combined into a single high-quality transcriptome.
|
6
|
+
|
7
|
+
Transfuse takes in the reads you used to do the assembly and a list of fasta files and produces a single output fasta file.
|
8
|
+
|
9
|
+
### Installation and Running
|
10
|
+
|
11
|
+
To install Transfuse, clone this repo:
|
12
|
+
|
13
|
+
`git clone https://github.com/cboursnell/transfuse.git`
|
14
|
+
|
15
|
+
Then build and install the Ruby gem:
|
16
|
+
|
17
|
+
`gem build *spec; gem install *gem`
|
18
|
+
|
19
|
+
### Usage
|
20
|
+
|
21
|
+
Transfuse is run on the command line. The options are:
|
22
|
+
|
23
|
+
```
|
24
|
+
-a, --assembly=<s> assembly files in FASTA format, comma-separated
|
25
|
+
-l, --left=<s> left reads file in FASTQ format
|
26
|
+
-r, --right=<s> right reads file in FASTQ format
|
27
|
+
-o, --output=<s> write merged assembly to file
|
28
|
+
-t, --threads=<i> number of threads (default: 1)
|
29
|
+
-v, --verbose be verbose
|
30
|
+
-e, --version Print version and exit
|
31
|
+
-h, --help Show this message
|
32
|
+
```
|
33
|
+
|
34
|
+
An example command:
|
35
|
+
|
36
|
+
`transfuse --assembly soap-k31.fa,soap-k41.fa,soap-k51.fa --left reads_1.fq --right reads_2.fq --output soap-merged.fa --threads 12`
|
37
|
+
|
38
|
+
### License
|
39
|
+
|
40
|
+
This is academic software - please cite us if you use it in your work.
|
41
|
+
|
42
|
+
Transfuse is released under the MIT license.
|
data/Rakefile
CHANGED
@@ -5,15 +5,21 @@ Rake::TestTask.new do |t|
|
|
5
5
|
end
|
6
6
|
|
7
7
|
Rake::TestTask.new do |t|
|
8
|
-
t.name = :
|
8
|
+
t.name = :cluster
|
9
9
|
t.libs << 'test'
|
10
|
-
t.test_files = ['test/
|
10
|
+
t.test_files = ['test/test_cluster.rb']
|
11
11
|
end
|
12
12
|
|
13
13
|
Rake::TestTask.new do |t|
|
14
|
-
t.name = :
|
14
|
+
t.name = :fuse
|
15
15
|
t.libs << 'test'
|
16
|
-
t.test_files = ['test/
|
16
|
+
t.test_files = ['test/test_transfuse.rb']
|
17
|
+
end
|
18
|
+
|
19
|
+
Rake::TestTask.new do |t|
|
20
|
+
t.name = :cons
|
21
|
+
t.libs << 'test'
|
22
|
+
t.test_files = ['test/test_consensus.rb']
|
17
23
|
end
|
18
24
|
|
19
25
|
desc "Run tests"
|
data/bin/transfuse
CHANGED
@@ -22,23 +22,24 @@ opts = Trollop::options do
|
|
22
22
|
OPTIONS:
|
23
23
|
|
24
24
|
EOS
|
25
|
-
opt :
|
25
|
+
opt :assemblies, "assembly files in FASTA format, comma-separated",
|
26
26
|
:type => String, :required => true
|
27
|
-
opt :scores, "transrate contig score output files, comma-separated",
|
28
|
-
:type => String
|
29
27
|
opt :left, "left reads file in FASTQ format",
|
30
28
|
:type => String
|
31
29
|
opt :right, "right reads file in FASTQ format",
|
32
30
|
:type => String
|
31
|
+
opt :scores, "transrate contig score output files, comma-separated. Ignored if reads are provided",
|
32
|
+
:type => String
|
33
33
|
opt :output, "write merged assembly to file",
|
34
34
|
:type => String, :required => :true
|
35
35
|
opt :threads, "number of threads", :type => :int, :default => 1
|
36
|
+
opt :id, "sequence identity to cluster at", :type => :float, :default => 1.0
|
36
37
|
opt :verbose, "be verbose"
|
37
38
|
end
|
38
39
|
|
39
40
|
transfuse = Transfuse::Transfuse.new opts.threads, opts.verbose
|
40
41
|
|
41
|
-
assembly_files = transfuse.check_files opts.
|
42
|
+
assembly_files = transfuse.check_files opts.assemblies
|
42
43
|
score_files = transfuse.check_files opts.score if opts.score
|
43
44
|
left = transfuse.check_files opts.left if opts.left
|
44
45
|
right = transfuse.check_files opts.right if opts.right
|
@@ -54,6 +55,7 @@ else
|
|
54
55
|
abort msg
|
55
56
|
end
|
56
57
|
|
58
|
+
# filter out assemblies with low score
|
57
59
|
assembly_files = transfuse.filter assembly_files, scores
|
58
60
|
|
59
61
|
# concatenate assemblies into one fasta file
|
@@ -62,12 +64,11 @@ cat = transfuse.concatenate assembly_files
|
|
62
64
|
# load fasta sequences from concatenated file into hash
|
63
65
|
transfuse.load_fasta cat
|
64
66
|
|
65
|
-
# cluster using vsearch
|
66
|
-
|
67
|
-
|
68
|
-
transfuse.sequence_alignment clusters
|
69
|
-
# pull out contigs from each cluster based on the scores
|
70
|
-
# best = transfuse.select_contigs clusters, scores
|
67
|
+
# cluster using vsearch
|
68
|
+
msa = transfuse.cluster cat, opts.id
|
71
69
|
|
72
|
-
#
|
70
|
+
# read the msa from vsearch and produce a consensus fasta
|
71
|
+
cons = transfuse.consensus msa, scores, opts.output
|
73
72
|
|
73
|
+
# transrate the consensus output to remove low scoring contigs
|
74
|
+
transfuse.transrate_consensus cons, opts.output, left, right
|
data/deps/deps.yaml
CHANGED
@@ -0,0 +1,11 @@
|
|
1
|
+
vsearch:
|
2
|
+
binaries:
|
3
|
+
- vsearch
|
4
|
+
version:
|
5
|
+
number: '1.1.3'
|
6
|
+
command: 'vsearch --version'
|
7
|
+
url:
|
8
|
+
64bit:
|
9
|
+
linux: https://github.com/torognes/vsearch/releases/download/v1.1.3/vsearch-1.1.3-linux-x86_64
|
10
|
+
macosx: https://github.com/torognes/vsearch/releases/download/v1.1.3/vsearch-1.1.3-osx-x86_64
|
11
|
+
unpack: false
|
data/lib/transfuse/cluster.rb
CHANGED
@@ -5,61 +5,31 @@ module Transfuse
|
|
5
5
|
|
6
6
|
class Cluster
|
7
7
|
|
8
|
-
def initialize threads, verbose
|
9
|
-
@cdhit = Which::which('cd-hit-est').first
|
10
|
-
raise "cd-hit-est was not in the PATH - please install it" unless @cdhit
|
8
|
+
def initialize threads, verbose, id
|
11
9
|
@vsearch = Which::which('vsearch').first
|
12
10
|
raise "vsearch was not in the PATH - please install it" unless @vsearch
|
13
|
-
@id =
|
11
|
+
@id = id.to_s
|
14
12
|
@threads = threads
|
15
13
|
@verbose = verbose
|
16
14
|
end
|
17
15
|
|
18
16
|
def run fasta
|
19
|
-
|
20
|
-
|
21
|
-
output = cd_hit fasta
|
22
|
-
return parse_output output
|
23
|
-
else
|
24
|
-
output = vsearch fasta
|
25
|
-
return parse_vsearch_output output
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
def cd_hit fasta
|
30
|
-
puts "running cd-hit-est" if @verbose
|
31
|
-
output = "#{File.basename(fasta, File.extname(fasta))}_cdhit.fa"
|
32
|
-
cdhit_cmd = generate_cdhit_command fasta, output
|
33
|
-
puts cdhit_cmd if @verbose
|
34
|
-
cluster = Cmd.new cdhit_cmd
|
35
|
-
cluster.run output
|
36
|
-
return "#{output}.clstr"
|
17
|
+
cluster_output, msa_output = vsearch fasta
|
18
|
+
return parse_vsearch_output(cluster_output, msa_output)
|
37
19
|
end
|
38
20
|
|
39
21
|
def vsearch fasta
|
40
|
-
|
41
|
-
cluster_output = "#{fasta}.clust"
|
42
|
-
|
22
|
+
print "running vsearch..." if @verbose
|
23
|
+
cluster_output = "#{File.basename(fasta)}-#{@id}.clust"
|
24
|
+
msa_output = "#{File.basename(fasta)}-#{@id}.aln"
|
25
|
+
vsearch_cmd = generate_vsearch_command fasta, cluster_output, msa_output
|
43
26
|
cluster = Cmd.new vsearch_cmd
|
44
27
|
cluster.run cluster_output
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
def generate_cdhit_command fasta, out
|
49
|
-
#cd-hit-est -i all.fa -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
|
50
|
-
cmd = "#{@cdhit}"
|
51
|
-
cmd << " -i #{fasta}"
|
52
|
-
cmd << " -o #{out}"
|
53
|
-
cmd << " -c #{@id}" # similarity = number of identical bases /
|
54
|
-
# length of shorter sequences
|
55
|
-
cmd << " -T #{@threads}"
|
56
|
-
cmd << " -n 10" # word length - maybe increase??
|
57
|
-
cmd << " -d 100" # output name width
|
58
|
-
cmd << " -g 1" # slower but more accurate mode
|
59
|
-
cmd << " -M 8000" # increase memory
|
28
|
+
puts " Done. Created #{cluster_output}" if @verbose
|
29
|
+
return [cluster_output, msa_output]
|
60
30
|
end
|
61
31
|
|
62
|
-
def generate_vsearch_command fasta, out
|
32
|
+
def generate_vsearch_command fasta, out, msa
|
63
33
|
vsearch = "#{@vsearch}"
|
64
34
|
vsearch << " --cluster_fast #{fasta}"
|
65
35
|
vsearch << " --id #{@id}"
|
@@ -67,45 +37,60 @@ module Transfuse
|
|
67
37
|
vsearch << " --qmask none" # no masking
|
68
38
|
vsearch << " --strand both"
|
69
39
|
vsearch << " --uc #{out}"
|
40
|
+
vsearch << " --msaout #{msa}"
|
70
41
|
vsearch << " --threads #{@threads}"
|
71
42
|
return vsearch
|
72
43
|
end
|
73
44
|
|
74
|
-
def
|
75
|
-
|
76
|
-
cluster_id = 0
|
77
|
-
clusters = {}
|
78
|
-
File.open(cluster_output).each_line do |line|
|
79
|
-
if line =~ />Cluster\ ([0-9]+)/
|
80
|
-
cluster_id = $1.to_i
|
81
|
-
elsif line =~ /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
|
82
|
-
contig_name = $1
|
83
|
-
strand = $2
|
84
|
-
id = $3.to_f
|
85
|
-
clusters[cluster_id] ||= []
|
86
|
-
clusters[cluster_id] << { :name => contig_name, :strand => strand }
|
87
|
-
elsif line =~ /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
|
88
|
-
contig_name = $1
|
89
|
-
strand = "+"
|
90
|
-
clusters[cluster_id] ||= []
|
91
|
-
clusters[cluster_id] << { :name => contig_name, :strand => strand }
|
92
|
-
end
|
93
|
-
end
|
94
|
-
return clusters
|
95
|
-
end
|
96
|
-
|
97
|
-
def parse_vsearch_output cluster_output
|
45
|
+
def parse_vsearch_output cluster_output, msa_output
|
46
|
+
print "parsing vsearch output" if @verbose
|
98
47
|
clusters = {}
|
48
|
+
lookup = {}
|
49
|
+
second = 0
|
50
|
+
count = 0
|
99
51
|
File.open(cluster_output).each_line do |line|
|
52
|
+
count+=1
|
100
53
|
if line.start_with?("S") or line.start_with?("H")
|
101
54
|
cols = line.chomp.split("\t")
|
102
|
-
cluster = cols[1]
|
55
|
+
cluster = cols[1]
|
56
|
+
len = cols[2].to_i
|
57
|
+
cigar = cols[7]
|
58
|
+
strand = cols[4]
|
59
|
+
strand = "+" if strand == "*"
|
103
60
|
contig_name = cols[8]
|
61
|
+
|
104
62
|
clusters[cluster] ||= []
|
105
|
-
clusters[cluster] << contig_name
|
63
|
+
clusters[cluster] << { :name => contig_name, :strand => strand }
|
64
|
+
lookup[contig_name] = cluster
|
65
|
+
end
|
66
|
+
if count%10_000==0 and @verbose
|
67
|
+
print "."
|
106
68
|
end
|
107
69
|
end
|
108
|
-
|
70
|
+
puts " Done" if @verbose
|
71
|
+
print "parsing msa output " if @verbose
|
72
|
+
count = 0
|
73
|
+
msa = {}
|
74
|
+
Bio::FastaFormat.open(msa_output).each do |entry|
|
75
|
+
count += 1
|
76
|
+
name = entry.entry_id
|
77
|
+
if name != "consensus"
|
78
|
+
# name = name[1..-1]
|
79
|
+
if name[0]=="*"
|
80
|
+
name = name[1..-1]
|
81
|
+
end
|
82
|
+
# what cluster is name in?
|
83
|
+
cluster = lookup[name]
|
84
|
+
msa[cluster] ||= []
|
85
|
+
msa[cluster] << { :name => name, :seq => entry.seq.seq }
|
86
|
+
end
|
87
|
+
if count%10_000==0 and @verbose
|
88
|
+
print "."
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
92
|
+
puts " Done" if @verbose
|
93
|
+
return msa
|
109
94
|
end
|
110
95
|
|
111
96
|
end
|
data/lib/transfuse/cmd.rb
CHANGED
@@ -0,0 +1,105 @@
|
|
1
|
+
|
2
|
+
require 'bio'
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
module Transfuse
|
6
|
+
|
7
|
+
class Consensus
|
8
|
+
|
9
|
+
attr_reader :contigs
|
10
|
+
|
11
|
+
def initialize verbose
|
12
|
+
@verbose = verbose
|
13
|
+
end
|
14
|
+
|
15
|
+
def run msa, scores, output
|
16
|
+
return 1 if File.exist?(output)
|
17
|
+
print "writing consensus " if @verbose
|
18
|
+
# msa is a hash
|
19
|
+
# key = cluster id
|
20
|
+
# value = list
|
21
|
+
# list of sequences in cluster aligned with gaps
|
22
|
+
preoutput = "#{File.basename(output, File.extname(output))}_cons.fa"
|
23
|
+
count = 0
|
24
|
+
File.open("#{output}.data", "w") do |out2|
|
25
|
+
File.open(preoutput, "w") do |out|
|
26
|
+
msa.each do |id, list|
|
27
|
+
count+=1
|
28
|
+
print "." if count%5_000==0 and @verbose
|
29
|
+
exons={}
|
30
|
+
cons = []
|
31
|
+
length = list[0][:seq].length
|
32
|
+
list.each_with_index do |hash, index|
|
33
|
+
seq = hash[:seq]
|
34
|
+
name = hash[:name]
|
35
|
+
out2.write "#{id}\t#{scores[name][:score]}\t#{name}\n"
|
36
|
+
prev = ""
|
37
|
+
gap = 0
|
38
|
+
exon = 0
|
39
|
+
seq.each_char do |c|
|
40
|
+
if c=="-"
|
41
|
+
base="-"
|
42
|
+
else
|
43
|
+
base="*"
|
44
|
+
end
|
45
|
+
if base!=prev
|
46
|
+
if c=="-"
|
47
|
+
gap+=1
|
48
|
+
else
|
49
|
+
exon+=1
|
50
|
+
end
|
51
|
+
end
|
52
|
+
if c=="-"
|
53
|
+
prev = "-"
|
54
|
+
else
|
55
|
+
prev = "*"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
exons[index] = exon
|
59
|
+
end
|
60
|
+
|
61
|
+
consensus = ""
|
62
|
+
0.upto(length-1) do |i|
|
63
|
+
base="N"
|
64
|
+
counts = {}
|
65
|
+
list.each_with_index do |hash, index|
|
66
|
+
seq = hash[:seq]
|
67
|
+
if seq[i] != "-" and seq[i] != "N"
|
68
|
+
counts[seq[i]]||=0
|
69
|
+
counts[seq[i]] += 1
|
70
|
+
if exons[index]==1
|
71
|
+
base = seq[i]
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
if counts.size>0
|
76
|
+
base = counts.sort.last.first
|
77
|
+
end
|
78
|
+
consensus << base
|
79
|
+
end
|
80
|
+
|
81
|
+
if consensus.count("N") < consensus.length.to_f*0.5
|
82
|
+
cons << consensus
|
83
|
+
end
|
84
|
+
|
85
|
+
list.each_with_index do |hash, index|
|
86
|
+
if exons[index] > 1
|
87
|
+
cons << hash[:seq].delete("-")
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
cons.each_with_index do |s,index|
|
92
|
+
out.write ">contig#{id}.#{index+1}\n"
|
93
|
+
out.write "#{s}\n"
|
94
|
+
end
|
95
|
+
|
96
|
+
end # msa.each
|
97
|
+
end # file
|
98
|
+
end # file open
|
99
|
+
puts " Done" if @verbose
|
100
|
+
return preoutput
|
101
|
+
end # def
|
102
|
+
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
data/lib/transfuse/transfuse.rb
CHANGED
@@ -6,27 +6,28 @@ end
|
|
6
6
|
|
7
7
|
module Transfuse
|
8
8
|
|
9
|
+
require 'bio'
|
9
10
|
require 'csv'
|
10
11
|
require 'transrate'
|
12
|
+
require 'threach'
|
11
13
|
|
12
14
|
class Transfuse
|
13
15
|
|
14
16
|
def initialize threads, verbose
|
15
17
|
@threads = threads
|
16
18
|
@verbose = verbose
|
17
|
-
@clustalo = Which::which('clustalo').first
|
18
|
-
raise "clustalo was not in the PATH - please install it" unless @clustalo
|
19
19
|
end
|
20
20
|
|
21
21
|
def check_files string
|
22
|
+
# puts "check file string: #{string}" if @verbose
|
22
23
|
list = []
|
23
24
|
string.split(",").each do |file|
|
24
25
|
file = File.expand_path(file)
|
25
26
|
if File.exist?(file)
|
26
|
-
puts "#{file} exists" if @verbose
|
27
|
+
puts "#{File.basename(file)} exists" if @verbose
|
27
28
|
list << file
|
28
29
|
else
|
29
|
-
abort "#{file} not found"
|
30
|
+
abort "#{File.basename(file)} not found"
|
30
31
|
end
|
31
32
|
end
|
32
33
|
return list
|
@@ -51,42 +52,27 @@ module Transfuse
|
|
51
52
|
return File.expand_path(catted_fasta)
|
52
53
|
end
|
53
54
|
|
54
|
-
def cluster file
|
55
|
-
puts "clustering #{file}" if @verbose
|
56
|
-
cluster = Cluster.new @threads, @verbose
|
57
|
-
return cluster.run file
|
58
|
-
end
|
59
|
-
|
60
55
|
def load_fasta fasta
|
56
|
+
print "loading fasta sequence #{fasta}..." if @verbose
|
61
57
|
@sequences = {}
|
58
|
+
count = 1
|
62
59
|
Bio::FastaFormat.open(fasta).each do |entry|
|
63
60
|
@sequences[entry.entry_id] = entry.seq.to_s
|
61
|
+
print "." if count%10_000==0 and @verbose
|
62
|
+
count +=1
|
64
63
|
end
|
64
|
+
puts " Done" if @verbose
|
65
65
|
end
|
66
66
|
|
67
|
-
def
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
seq << "#{@sequences[hash[:name]].revcomp}\n"
|
77
|
-
else
|
78
|
-
abort "Unknown strand #{hash[:strand]}"
|
79
|
-
end
|
80
|
-
end
|
81
|
-
cmd = "echo -e \"#{seq}\" | #{@clustalo} -i - --outfmt fa "
|
82
|
-
cmd << "--output-order tree-order"
|
83
|
-
align = Cmd.new cmd
|
84
|
-
align.run
|
85
|
-
File.open("cluster#{id}.fa", "wb") do |out|
|
86
|
-
out.write align.stdout
|
87
|
-
end
|
88
|
-
end
|
89
|
-
end
|
67
|
+
def cluster file, id
|
68
|
+
puts "clustering #{file}" if @verbose
|
69
|
+
cluster = Cluster.new @threads, @verbose, id
|
70
|
+
return cluster.run file
|
71
|
+
end
|
72
|
+
|
73
|
+
def consensus msa, scores, output
|
74
|
+
cons = Consensus.new(@verbose)
|
75
|
+
return cons.run(msa, scores, output)
|
90
76
|
end
|
91
77
|
|
92
78
|
def load_scores files
|
@@ -96,8 +82,10 @@ module Transfuse
|
|
96
82
|
:header_converters => :symbol,
|
97
83
|
:converters => :all) do |row|
|
98
84
|
name = row[:contig_name]
|
99
|
-
|
100
|
-
|
85
|
+
scores[name] = { :score => row[:score].to_f,
|
86
|
+
:p_good => row[:p_good].to_f,
|
87
|
+
:p_bases_covered => row[:p_bases_covered].to_f,
|
88
|
+
:coverage => row[:coverage].to_f }
|
101
89
|
end
|
102
90
|
end
|
103
91
|
return scores
|
@@ -107,13 +95,15 @@ module Transfuse
|
|
107
95
|
filtered_files = []
|
108
96
|
files.each_with_index do |file, index|
|
109
97
|
new_filename = "#{File.basename(file, File.extname(file))}_filtered.fa"
|
110
|
-
|
98
|
+
if !File.exist?(new_filename) or File.stat(new_filename).size < 1
|
111
99
|
File.open(new_filename, "wb") do |out|
|
112
|
-
puts "
|
100
|
+
puts "filtering #{file}..." if @verbose
|
113
101
|
Bio::FastaFormat.open(file).each do |entry|
|
114
102
|
contig_name = entry.entry_id
|
115
103
|
contig_name = "contig#{index}_#{contig_name}"
|
116
|
-
if scores.key?(contig_name) and
|
104
|
+
if scores.key?(contig_name) and
|
105
|
+
scores[contig_name][:score] > 0.01 and
|
106
|
+
scores[contig_name][:coverage] >= 1
|
117
107
|
out.write ">#{contig_name}\n"
|
118
108
|
out.write "#{entry.seq}\n"
|
119
109
|
elsif !scores.key?(contig_name)
|
@@ -127,75 +117,105 @@ module Transfuse
|
|
127
117
|
return filtered_files
|
128
118
|
end
|
129
119
|
|
120
|
+
def transrate_consensus file, output, left, right
|
121
|
+
output = File.expand_path(output)
|
122
|
+
puts "transrate on #{file}" if @verbose
|
123
|
+
file = File.expand_path(file)
|
124
|
+
name = File.basename(file, File.extname(file))
|
125
|
+
dir = "transrate_#{name}"
|
126
|
+
Dir.mkdir(dir) unless Dir.exist?(dir)
|
127
|
+
Dir.chdir(dir) do
|
128
|
+
assembly = Transrate::Assembly.new(file)
|
129
|
+
transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
|
130
|
+
rename = "assembly_#{name}_score_optimisation.csv"
|
131
|
+
rm = transrater.read_metrics(left.join(','), right.join(','))
|
132
|
+
stats = rm.read_stats
|
133
|
+
File.rename("assembly_score_optimisation.csv", rename)
|
134
|
+
scores={}
|
135
|
+
assembly.each do |name, contig|
|
136
|
+
scores[name] = { :score => contig.score.to_f,
|
137
|
+
:p_good => contig.p_good.to_f,
|
138
|
+
:p_bases_covered => contig.p_bases_covered.to_f,
|
139
|
+
:coverage => contig.coverage.to_f }
|
140
|
+
end
|
141
|
+
scores_file = "#{name}_scores.csv"
|
142
|
+
stats_file = "../#{name}_stats.txt"
|
143
|
+
puts " writing scores" if @verbose
|
144
|
+
File.open(scores_file, "wb") do |out|
|
145
|
+
scores.each do |name, hash|
|
146
|
+
out.write "#{name}\t#{hash[:score]}\t#{hash[:p_good]}\t"
|
147
|
+
out.write "#{hash[:p_bases_covered]}\t#{hash[:coverage]}\n"
|
148
|
+
end
|
149
|
+
end
|
150
|
+
puts " writing filtered fasta file" if @verbose
|
151
|
+
File.open(output, "wb") do |out|
|
152
|
+
assembly.each do |name, contig|
|
153
|
+
if contig.score.to_f > 0.01 and contig.coverage.to_f >= 1
|
154
|
+
out.write ">#{name}\n"
|
155
|
+
out.write "#{contig.seq.seq}\n"
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
puts " writing stats" if @verbose
|
160
|
+
File.open(stats_file, "wb") do |out|
|
161
|
+
stats.each do |key, value|
|
162
|
+
out.write "#{key}\t#{value}\n"
|
163
|
+
end
|
164
|
+
out.write "assembly score:\t#{transrater.assembly_score}\n"
|
165
|
+
optimal = transrater.assembly_optimal_score("prefix")
|
166
|
+
out.write "optimal score :\t#{optimal[0]}\n"
|
167
|
+
out.write "cutoff :\t#{optimal[1]}\n"
|
168
|
+
end
|
169
|
+
end
|
170
|
+
end
|
130
171
|
|
131
172
|
def transrate files, left, right
|
132
173
|
scores = {}
|
133
|
-
|
174
|
+
shortname = ""
|
175
|
+
files.each do |n|
|
176
|
+
shortname << File.basename(n, File.extname(n))[0..4]
|
177
|
+
end
|
178
|
+
scores_file = "#{shortname}_scores.csv"
|
134
179
|
if File.exist?(scores_file)
|
135
180
|
puts "loading scores from file" if @verbose
|
136
181
|
File.open(scores_file).each do |line|
|
137
|
-
name, score = line.chomp.split("\t")
|
138
|
-
scores[name] = score.to_f
|
182
|
+
name, score, p_good, p_bases_covered, coverage = line.chomp.split("\t")
|
183
|
+
scores[name] = { :score => score.to_f,
|
184
|
+
:p_good => p_good.to_f,
|
185
|
+
:p_bases_covered => p_bases_covered.to_f,
|
186
|
+
:coverage => coverage.to_f }
|
139
187
|
end
|
140
188
|
else
|
141
189
|
files.each_with_index do |fasta, index|
|
142
190
|
puts "transrate on #{fasta}" if @verbose
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
191
|
+
dir = "transrate_#{File.basename(fasta, File.extname(fasta))}"
|
192
|
+
Dir.mkdir(dir) unless Dir.exist?(dir)
|
193
|
+
Dir.chdir(dir) do
|
194
|
+
assembly = Transrate::Assembly.new(fasta)
|
195
|
+
transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
|
196
|
+
rename = "assembly#{index}_score_optimisation.csv"
|
197
|
+
transrater.read_metrics(left.join(','), right.join(','))
|
198
|
+
File.rename("assembly_score_optimisation.csv", rename)
|
199
|
+
assembly.each do |name, contig|
|
200
|
+
name = "contig#{index}_#{name}"
|
201
|
+
scores[name] = { :score => contig.score.to_f,
|
202
|
+
:p_good => contig.p_good.to_f,
|
203
|
+
:p_bases_covered => contig.p_bases_covered.to_f,
|
204
|
+
:coverage => contig.coverage.to_f }
|
205
|
+
|
206
|
+
end
|
149
207
|
end
|
150
208
|
end
|
151
209
|
File.open(scores_file, "wb") do |out|
|
152
|
-
scores.each do |name,
|
153
|
-
out.write "#{name}\t#{score}\
|
210
|
+
scores.each do |name, hash|
|
211
|
+
out.write "#{name}\t#{hash[:score]}\t#{hash[:p_good]}\t"
|
212
|
+
out.write "#{hash[:p_bases_covered]}\t#{hash[:coverage]}\n"
|
154
213
|
end
|
155
214
|
end
|
156
215
|
end
|
157
216
|
return scores
|
158
217
|
end
|
159
218
|
|
160
|
-
def select_contigs clusters, scores
|
161
|
-
puts "selecting contigs" if @verbose
|
162
|
-
best = []
|
163
|
-
clusters.each do |cluster_id, list|
|
164
|
-
best_score = 0
|
165
|
-
best_contig = ""
|
166
|
-
list.each do |contig_name|
|
167
|
-
unless scores[contig_name]
|
168
|
-
abort "can't find #{contig_name} in scores hash\n"
|
169
|
-
end
|
170
|
-
if scores[contig_name] > best_score
|
171
|
-
best_score = scores[contig_name]
|
172
|
-
best_contig = contig_name
|
173
|
-
end
|
174
|
-
end
|
175
|
-
best << best_contig
|
176
|
-
end
|
177
|
-
return best
|
178
|
-
end
|
179
|
-
|
180
|
-
def output_contigs best, fasta, output
|
181
|
-
puts "writing contigs" if @verbose
|
182
|
-
# read in catted fasta sequences
|
183
|
-
sequences = {}
|
184
|
-
Bio::FastaFormat.open(fasta).each do |entry|
|
185
|
-
sequences[entry.entry_id] = entry.seq
|
186
|
-
end
|
187
|
-
File.open(output, "wb") do |out|
|
188
|
-
best.each do |contig_name|
|
189
|
-
if sequences.key?(contig_name)
|
190
|
-
out.write ">#{contig_name}\n"
|
191
|
-
out.write "#{sequences[contig_name]}\n"
|
192
|
-
else
|
193
|
-
puts "can't find #{contig_name} in #{fasta}"
|
194
|
-
end
|
195
|
-
end
|
196
|
-
end
|
197
|
-
end
|
198
|
-
|
199
219
|
end
|
200
220
|
|
201
221
|
end
|
data/lib/transfuse/version.rb
CHANGED
data/lib/transfuse.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
require 'transfuse/cluster
|
2
|
-
require 'transfuse/cmd
|
3
|
-
require 'transfuse/
|
4
|
-
require 'transfuse/
|
1
|
+
require 'transfuse/cluster'
|
2
|
+
require 'transfuse/cmd'
|
3
|
+
require 'transfuse/consensus'
|
4
|
+
require 'transfuse/transfuse'
|
5
|
+
require 'transfuse/version'
|
data/test/test_transfuse.rb
CHANGED
@@ -8,13 +8,13 @@ class TestTransfuse < Test::Unit::TestCase
|
|
8
8
|
context 'transfuse' do
|
9
9
|
|
10
10
|
setup do
|
11
|
-
@fuser = Transfuse::Transfuse.new 4
|
11
|
+
@fuser = Transfuse::Transfuse.new 4, true
|
12
12
|
end
|
13
13
|
|
14
14
|
teardown do
|
15
15
|
end
|
16
16
|
|
17
|
-
should 'check for existence of files' do
|
17
|
+
should '1 check for existence of files' do
|
18
18
|
list = []
|
19
19
|
list << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
|
20
20
|
list << File.join(File.dirname(__FILE__), 'data', 'assembly2.fasta')
|
@@ -22,7 +22,7 @@ class TestTransfuse < Test::Unit::TestCase
|
|
22
22
|
assert_equal 2, files.length, "length"
|
23
23
|
end
|
24
24
|
|
25
|
-
should "concatenate two files" do
|
25
|
+
should "2 concatenate two files" do
|
26
26
|
list = []
|
27
27
|
list << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
|
28
28
|
list << File.join(File.dirname(__FILE__), 'data', 'assembly2.fasta')
|
@@ -36,71 +36,69 @@ class TestTransfuse < Test::Unit::TestCase
|
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
39
|
-
should "cluster fasta file" do
|
40
|
-
Dir.mktmpdir do |tmpdir|
|
39
|
+
should "3 cluster fasta file" do
|
40
|
+
# Dir.mktmpdir do |tmpdir|
|
41
|
+
tmpdir = Dir.mktmpdir
|
41
42
|
Dir.chdir(tmpdir) do
|
42
43
|
file = File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
|
43
44
|
hash = @fuser.cluster file
|
44
45
|
assert_equal 250, hash.size, "output size"
|
45
46
|
end
|
46
|
-
end
|
47
|
+
# end
|
47
48
|
end
|
48
49
|
|
49
|
-
should "load scores from transrate output" do
|
50
|
+
should "4 load scores from transrate output" do
|
50
51
|
files = []
|
51
52
|
files << File.join(File.dirname(__FILE__), 'data', 'contig_scores1.csv')
|
52
53
|
hash = @fuser.load_scores files
|
53
54
|
assert_equal 99, hash.size
|
54
55
|
end
|
55
56
|
|
56
|
-
should "
|
57
|
+
should "5 run transrate on assembly files with reads" do
|
57
58
|
files = []
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
59
|
+
left = []
|
60
|
+
right = []
|
61
|
+
files << File.join(File.dirname(__FILE__), 'data', 'assembly3.fasta')
|
62
|
+
left << File.join(File.dirname(__FILE__), 'data', 'left.fq')
|
63
|
+
right << File.join(File.dirname(__FILE__), 'data', 'right.fq')
|
64
|
+
# Dir.mktmpdir do |tmpdir|
|
65
|
+
tmpdir = Dir.mktmpdir
|
66
|
+
Dir.chdir(tmpdir) do
|
67
|
+
scores = @fuser.transrate files, left, right
|
68
|
+
assert_equal 100, scores.size, "scores size"
|
69
|
+
end
|
70
|
+
# end
|
67
71
|
end
|
68
72
|
|
69
|
-
should "
|
73
|
+
should "6 filter contigs" do
|
70
74
|
files = []
|
71
75
|
left = []
|
72
76
|
right = []
|
73
|
-
files << File.join(File.dirname(__FILE__), 'data', '
|
77
|
+
files << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
|
74
78
|
left << File.join(File.dirname(__FILE__), 'data', 'left.fq')
|
75
79
|
right << File.join(File.dirname(__FILE__), 'data', 'right.fq')
|
76
|
-
Dir.mktmpdir do |tmpdir|
|
80
|
+
# Dir.mktmpdir do |tmpdir|
|
81
|
+
tmpdir = Dir.mktmpdir
|
77
82
|
Dir.chdir(tmpdir) do
|
78
83
|
scores = @fuser.transrate files, left, right
|
79
|
-
|
84
|
+
scores.each do |contig, score|
|
85
|
+
# puts "#{contig}\t#{score}"
|
86
|
+
end
|
87
|
+
new_list = @fuser.filter files, scores
|
88
|
+
assert_equal 1, new_list.length
|
89
|
+
cmd = "grep -c \">\" #{new_list.first}"
|
90
|
+
assert_equal 1, `#{cmd}`.chomp.split.first.to_i, "number of contigs"
|
80
91
|
end
|
81
|
-
end
|
92
|
+
# end
|
93
|
+
|
82
94
|
end
|
83
95
|
|
84
|
-
should "
|
85
|
-
|
86
|
-
scores = { "contig1" => 0.2,
|
87
|
-
"contig2" => 0.3,
|
88
|
-
"contig3" => 0.4,
|
89
|
-
"contig4" => 0.2 }
|
90
|
-
best = @fuser.select_contigs clusters, scores
|
91
|
-
assert_equal 2, best.size
|
92
|
-
assert_equal "contig2", best[0]
|
93
|
-
assert_equal "contig3", best[1]
|
96
|
+
should "7 get consensus of clusters" do
|
97
|
+
|
94
98
|
end
|
95
99
|
|
96
|
-
should "
|
97
|
-
|
98
|
-
file = File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
|
99
|
-
Dir.mktmpdir do |tmpdir|
|
100
|
-
Dir.chdir(tmpdir) do
|
101
|
-
@fuser.output_contigs best, file, "out"
|
102
|
-
end
|
103
|
-
end
|
100
|
+
should "8 not fail when there are duplicated kmers in the input sequences" do
|
101
|
+
|
104
102
|
end
|
105
103
|
|
106
104
|
end
|
data/transfuse.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |gem|
|
|
19
19
|
gem.add_dependency 'bio', '~> 1.4', '>= 1.4.3'
|
20
20
|
gem.add_dependency 'fixwhich', '~> 1.0', '>= 1.0.2'
|
21
21
|
gem.add_dependency 'bindeps', '~> 1.0', '>= 1.0.1'
|
22
|
-
gem.add_dependency 'transrate', '~> 1.0', '>= 1.0.
|
22
|
+
gem.add_dependency 'transrate', '~> 1.0', '>= 1.0.1'
|
23
23
|
|
24
24
|
gem.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
|
25
25
|
gem.add_development_dependency 'turn', '~> 0.9', '>= 0.9.7'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: transfuse
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Richard Smith-Unna
|
@@ -94,7 +94,7 @@ dependencies:
|
|
94
94
|
version: '1.0'
|
95
95
|
- - ">="
|
96
96
|
- !ruby/object:Gem::Version
|
97
|
-
version: 1.0.
|
97
|
+
version: 1.0.1
|
98
98
|
type: :runtime
|
99
99
|
prerelease: false
|
100
100
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -104,7 +104,7 @@ dependencies:
|
|
104
104
|
version: '1.0'
|
105
105
|
- - ">="
|
106
106
|
- !ruby/object:Gem::Version
|
107
|
-
version: 1.0.
|
107
|
+
version: 1.0.1
|
108
108
|
- !ruby/object:Gem::Dependency
|
109
109
|
name: rake
|
110
110
|
requirement: !ruby/object:Gem::Requirement
|
@@ -210,7 +210,6 @@ extra_rdoc_files: []
|
|
210
210
|
files:
|
211
211
|
- ".gitignore"
|
212
212
|
- Gemfile
|
213
|
-
- Gemfile.lock
|
214
213
|
- README.md
|
215
214
|
- Rakefile
|
216
215
|
- bin/transfuse
|
@@ -218,6 +217,7 @@ files:
|
|
218
217
|
- lib/transfuse.rb
|
219
218
|
- lib/transfuse/cluster.rb
|
220
219
|
- lib/transfuse/cmd.rb
|
220
|
+
- lib/transfuse/consensus.rb
|
221
221
|
- lib/transfuse/transfuse.rb
|
222
222
|
- lib/transfuse/version.rb
|
223
223
|
- notes.md
|
@@ -248,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
248
248
|
version: '0'
|
249
249
|
requirements: []
|
250
250
|
rubyforge_project:
|
251
|
-
rubygems_version: 2.
|
251
|
+
rubygems_version: 2.2.2
|
252
252
|
signing_key:
|
253
253
|
specification_version: 4
|
254
254
|
summary: Merge assemblies
|
data/Gemfile.lock
DELETED
@@ -1,87 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
transfuse (0.1.1)
|
5
|
-
bindeps (~> 1.0, >= 1.0.1)
|
6
|
-
bio (~> 1.4, >= 1.4.3)
|
7
|
-
fixwhich (~> 1.0, >= 1.0.2)
|
8
|
-
transrate (= 1.0.0.beta3)
|
9
|
-
trollop (~> 2.0)
|
10
|
-
|
11
|
-
GEM
|
12
|
-
remote: https://rubygems.org/
|
13
|
-
specs:
|
14
|
-
ansi (1.5.0)
|
15
|
-
bindeps (1.1.2)
|
16
|
-
fixwhich (~> 1.0, >= 1.0.2)
|
17
|
-
bio (1.4.3.0001)
|
18
|
-
coveralls (0.8.1)
|
19
|
-
json (~> 1.8)
|
20
|
-
rest-client (>= 1.6.8, < 2)
|
21
|
-
simplecov (~> 0.10.0)
|
22
|
-
term-ansicolor (~> 1.3)
|
23
|
-
thor (~> 0.19.1)
|
24
|
-
crb-blast (0.6.4)
|
25
|
-
bindeps (~> 1.0, >= 1.0.3)
|
26
|
-
bio (~> 1.4, >= 1.4.3)
|
27
|
-
fixwhich (~> 1.0, >= 1.0.2)
|
28
|
-
threach (~> 0.2, >= 0.2.0)
|
29
|
-
trollop (~> 2.0)
|
30
|
-
docile (1.1.5)
|
31
|
-
domain_name (0.5.24)
|
32
|
-
unf (>= 0.0.5, < 1.0.0)
|
33
|
-
facade (1.0.6)
|
34
|
-
fix-trinity-output (1.0.0)
|
35
|
-
trollop (~> 2.0)
|
36
|
-
fixwhich (1.0.2)
|
37
|
-
pathname2 (~> 1.4, >= 1.4.4)
|
38
|
-
http-cookie (1.0.2)
|
39
|
-
domain_name (~> 0.5)
|
40
|
-
json (1.8.3)
|
41
|
-
mime-types (2.6.1)
|
42
|
-
minitest (4.7.5)
|
43
|
-
netrc (0.10.3)
|
44
|
-
pathname2 (1.7.3)
|
45
|
-
facade
|
46
|
-
rake (10.4.2)
|
47
|
-
rest-client (1.8.0)
|
48
|
-
http-cookie (>= 1.0.2, < 2.0)
|
49
|
-
mime-types (>= 1.16, < 3.0)
|
50
|
-
netrc (~> 0.7)
|
51
|
-
shoulda-context (1.2.1)
|
52
|
-
simplecov (0.10.0)
|
53
|
-
docile (~> 1.1.0)
|
54
|
-
json (~> 1.8)
|
55
|
-
simplecov-html (~> 0.10.0)
|
56
|
-
simplecov-html (0.10.0)
|
57
|
-
term-ansicolor (1.3.0)
|
58
|
-
tins (~> 1.0)
|
59
|
-
thor (0.19.1)
|
60
|
-
threach (0.2.0)
|
61
|
-
tins (1.5.2)
|
62
|
-
transrate (1.0.0.beta3)
|
63
|
-
bindeps (~> 1.1, >= 1.1.2)
|
64
|
-
bio (~> 1.4, >= 1.4.3)
|
65
|
-
crb-blast (~> 0.5, >= 0.5.0)
|
66
|
-
fix-trinity-output (~> 1.0, >= 1.0)
|
67
|
-
trollop (~> 2.0, >= 2.0.0)
|
68
|
-
yell (~> 2.0, >= 2.0.4)
|
69
|
-
trollop (2.1.1)
|
70
|
-
turn (0.9.7)
|
71
|
-
ansi
|
72
|
-
minitest (~> 4)
|
73
|
-
unf (0.1.4)
|
74
|
-
unf_ext
|
75
|
-
unf_ext (0.0.7.1)
|
76
|
-
yell (2.0.5)
|
77
|
-
|
78
|
-
PLATFORMS
|
79
|
-
ruby
|
80
|
-
|
81
|
-
DEPENDENCIES
|
82
|
-
coveralls (~> 0.7)
|
83
|
-
rake (~> 10.3, >= 10.3.2)
|
84
|
-
shoulda-context (~> 1.2, >= 1.2.1)
|
85
|
-
simplecov (~> 0.8, >= 0.8.2)
|
86
|
-
transfuse!
|
87
|
-
turn (~> 0.9, >= 0.9.7)
|