transfuse 0.1.4 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a5b17813bba553fa78a7e3cf24366981e368ebfe
4
- data.tar.gz: f57a5011b3296862518ce54a7f3d4bb928e05aab
3
+ metadata.gz: 03b84ea2236b7c43ce321f64352bb6cb7b7035d4
4
+ data.tar.gz: ed70d9a9c204a06b550a7b95c01c74afdcb98b51
5
5
  SHA512:
6
- metadata.gz: 52af21dbf0c5b49aa746d515a214229bccd78a55d5998ddc929b2594d8abe5bafcb9cd1bbc98ec6ea1a41a5cb3240666616837d3d06c5cbacd25acca3fe392ca
7
- data.tar.gz: 147cc022f22ad319ff0651a6092ced5d3f1c7e99ab3989a8091970ab5c25a97e42d9b98f7384e28712ef802c4ced8090c3942ba66105963013eb30e46c3b58cb
6
+ metadata.gz: 6074f06d8afd9e33fce7b77e0221ffc8a300da966cc7916f86eb245d02c1e2b5519a831e1c99962fcfd5b085b5cd3d94765297ec94469270e4246c6c9ab71830
7
+ data.tar.gz: d992bac7cb1a98cd523e4df28d5b684f42bf8d8d7deb7882d42ecd8bb50bb137a190200006c7528f834d9e30a214b1f304a46b8502176993e22b019658c19061
data/README.md CHANGED
@@ -1 +1,42 @@
1
- Transfuse
1
+ ## Transfuse
2
+
3
+ **Transfuse is currently in development and is not yet ready for use**
4
+
5
+ Transfuse intelligently merges your multiple de novo transcriptome assemblies. Run multiple assemblies with different de novo assemblers, or different settings in the same assembler and have them combined into a single high quality transcriptome.
6
+
7
+ Transfuse takes in the reads you used to do the assembly and a list of fasta files and produces a single output fasta file.
8
+
9
+ ### Installation and Running
10
+
11
+ To install Transfuse, clone this repo:
12
+
13
+ `git clone https://github.com/cboursnell/transfuse.git`
14
+
15
+ Then build and install the Ruby gem:
16
+
17
+ `gem build *spec; gem install *gem`
18
+
19
+ ### Usage
20
+
21
+ Transfuse is run on the command line. The options are:
22
+
23
+ ```
24
+ -a, --assemblies=<s> assembly files in FASTA format, comma-separated
25
+ -l, --left=<s> left reads file in FASTQ format
26
+ -r, --right=<s> right reads file in FASTQ format
27
+ -o, --output=<s> write merged assembly to file
28
+ -t, --threads=<i> number of threads (default: 1)
29
+ -v, --verbose be verbose
30
+ -e, --version Print version and exit
31
+ -h, --help Show this message
32
+ ```
33
+
34
+ An example command:
35
+
36
+ `transfuse --assemblies soap-k31.fa,soap-k41.fa,soap-k51.fa --left reads_1.fq --right reads_2.fq --output soap-merged.fa --threads 12`
37
+
38
+ ### License
39
+
40
+ This is academic software - please cite us if you use it in your work.
41
+
42
+ Transfuse is released under the MIT license.
data/Rakefile CHANGED
@@ -5,15 +5,21 @@ Rake::TestTask.new do |t|
5
5
  end
6
6
 
7
7
  Rake::TestTask.new do |t|
8
- t.name = :corset
8
+ t.name = :cluster
9
9
  t.libs << 'test'
10
- t.test_files = ['test/test_corset.rb']
10
+ t.test_files = ['test/test_cluster.rb']
11
11
  end
12
12
 
13
13
  Rake::TestTask.new do |t|
14
- t.name = :cluster
14
+ t.name = :fuse
15
15
  t.libs << 'test'
16
- t.test_files = ['test/test_cluster.rb']
16
+ t.test_files = ['test/test_transfuse.rb']
17
+ end
18
+
19
+ Rake::TestTask.new do |t|
20
+ t.name = :cons
21
+ t.libs << 'test'
22
+ t.test_files = ['test/test_consensus.rb']
17
23
  end
18
24
 
19
25
  desc "Run tests"
data/bin/transfuse CHANGED
@@ -22,23 +22,24 @@ opts = Trollop::options do
22
22
  OPTIONS:
23
23
 
24
24
  EOS
25
- opt :assembly, "assembly files in FASTA format, comma-separated",
25
+ opt :assemblies, "assembly files in FASTA format, comma-separated",
26
26
  :type => String, :required => true
27
- opt :scores, "transrate contig score output files, comma-separated",
28
- :type => String
29
27
  opt :left, "left reads file in FASTQ format",
30
28
  :type => String
31
29
  opt :right, "right reads file in FASTQ format",
32
30
  :type => String
31
+ opt :scores, "transrate contig score output files, comma-separated. Ignored if reads are provided",
32
+ :type => String
33
33
  opt :output, "write merged assembly to file",
34
34
  :type => String, :required => :true
35
35
  opt :threads, "number of threads", :type => :int, :default => 1
36
+ opt :id, "sequence identity to cluster at", :type => :float, :default => 1.0
36
37
  opt :verbose, "be verbose"
37
38
  end
38
39
 
39
40
  transfuse = Transfuse::Transfuse.new opts.threads, opts.verbose
40
41
 
41
- assembly_files = transfuse.check_files opts.assembly
42
+ assembly_files = transfuse.check_files opts.assemblies
42
43
  score_files = transfuse.check_files opts.score if opts.score
43
44
  left = transfuse.check_files opts.left if opts.left
44
45
  right = transfuse.check_files opts.right if opts.right
@@ -54,6 +55,7 @@ else
54
55
  abort msg
55
56
  end
56
57
 
58
+ # filter out assemblies with low score
57
59
  assembly_files = transfuse.filter assembly_files, scores
58
60
 
59
61
  # concatenate assemblies into one fasta file
@@ -62,12 +64,11 @@ cat = transfuse.concatenate assembly_files
62
64
  # load fasta sequences from concatenated file into hash
63
65
  transfuse.load_fasta cat
64
66
 
65
- # cluster using vsearch or maybe cd-hit-est
66
- clusters = transfuse.cluster cat
67
-
68
- transfuse.sequence_alignment clusters
69
- # pull out contigs from each cluster based on the scores
70
- # best = transfuse.select_contigs clusters, scores
67
+ # cluster using vsearch
68
+ msa = transfuse.cluster cat, opts.id
71
69
 
72
- # transfuse.output_contigs best, cat, opts.output
70
+ # read the msa from vsearch and produce a consensus fasta
71
+ cons = transfuse.consensus msa, scores, opts.output
73
72
 
73
+ # transrate the consensus output to remove low scoring contigs
74
+ transfuse.transrate_consensus cons, opts.output, left, right
data/deps/deps.yaml CHANGED
@@ -0,0 +1,11 @@
1
+ vsearch:
2
+ binaries:
3
+ - vsearch
4
+ version:
5
+ number: '1.1.3'
6
+ command: 'vsearch --version'
7
+ url:
8
+ 64bit:
9
+ linux: https://github.com/torognes/vsearch/releases/download/v1.1.3/vsearch-1.1.3-linux-x86_64
10
+ macosx: https://github.com/torognes/vsearch/releases/download/v1.1.3/vsearch-1.1.3-osx-x86_64
11
+ unpack: false
@@ -5,61 +5,31 @@ module Transfuse
5
5
 
6
6
  class Cluster
7
7
 
8
- def initialize threads, verbose
9
- @cdhit = Which::which('cd-hit-est').first
10
- raise "cd-hit-est was not in the PATH - please install it" unless @cdhit
8
+ def initialize threads, verbose, id
11
9
  @vsearch = Which::which('vsearch').first
12
10
  raise "vsearch was not in the PATH - please install it" unless @vsearch
13
- @id = "1.00"
11
+ @id = id.to_s
14
12
  @threads = threads
15
13
  @verbose = verbose
16
14
  end
17
15
 
18
16
  def run fasta
19
- use_cd_hit = false
20
- if use_cd_hit
21
- output = cd_hit fasta
22
- return parse_output output
23
- else
24
- output = vsearch fasta
25
- return parse_vsearch_output output
26
- end
27
- end
28
-
29
- def cd_hit fasta
30
- puts "running cd-hit-est" if @verbose
31
- output = "#{File.basename(fasta, File.extname(fasta))}_cdhit.fa"
32
- cdhit_cmd = generate_cdhit_command fasta, output
33
- puts cdhit_cmd if @verbose
34
- cluster = Cmd.new cdhit_cmd
35
- cluster.run output
36
- return "#{output}.clstr"
17
+ cluster_output, msa_output = vsearch fasta
18
+ return parse_vsearch_output(cluster_output, msa_output)
37
19
  end
38
20
 
39
21
  def vsearch fasta
40
- puts "running vsearch" if @verbose
41
- cluster_output = "#{fasta}.clust"
42
- vsearch_cmd = generate_vsearch_command fasta, cluster_output
22
+ print "running vsearch..." if @verbose
23
+ cluster_output = "#{File.basename(fasta)}-#{@id}.clust"
24
+ msa_output = "#{File.basename(fasta)}-#{@id}.aln"
25
+ vsearch_cmd = generate_vsearch_command fasta, cluster_output, msa_output
43
26
  cluster = Cmd.new vsearch_cmd
44
27
  cluster.run cluster_output
45
- return cluster_output
46
- end
47
-
48
- def generate_cdhit_command fasta, out
49
- #cd-hit-est -i all.fa -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
50
- cmd = "#{@cdhit}"
51
- cmd << " -i #{fasta}"
52
- cmd << " -o #{out}"
53
- cmd << " -c #{@id}" # similarity = number of identical bases /
54
- # length of shorter sequences
55
- cmd << " -T #{@threads}"
56
- cmd << " -n 10" # word length - maybe increase??
57
- cmd << " -d 100" # output name width
58
- cmd << " -g 1" # slower but more accurate mode
59
- cmd << " -M 8000" # increase memory
28
+ puts " Done. Created #{cluster_output}" if @verbose
29
+ return [cluster_output, msa_output]
60
30
  end
61
31
 
62
- def generate_vsearch_command fasta, out
32
+ def generate_vsearch_command fasta, out, msa
63
33
  vsearch = "#{@vsearch}"
64
34
  vsearch << " --cluster_fast #{fasta}"
65
35
  vsearch << " --id #{@id}"
@@ -67,45 +37,60 @@ module Transfuse
67
37
  vsearch << " --qmask none" # no masking
68
38
  vsearch << " --strand both"
69
39
  vsearch << " --uc #{out}"
40
+ vsearch << " --msaout #{msa}"
70
41
  vsearch << " --threads #{@threads}"
71
42
  return vsearch
72
43
  end
73
44
 
74
- def parse_output cluster_output
75
- puts "parsing cd-hit output #{cluster_output}" if @verbose
76
- cluster_id = 0
77
- clusters = {}
78
- File.open(cluster_output).each_line do |line|
79
- if line =~ />Cluster\ ([0-9]+)/
80
- cluster_id = $1.to_i
81
- elsif line =~ /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
82
- contig_name = $1
83
- strand = $2
84
- id = $3.to_f
85
- clusters[cluster_id] ||= []
86
- clusters[cluster_id] << { :name => contig_name, :strand => strand }
87
- elsif line =~ /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
88
- contig_name = $1
89
- strand = "+"
90
- clusters[cluster_id] ||= []
91
- clusters[cluster_id] << { :name => contig_name, :strand => strand }
92
- end
93
- end
94
- return clusters
95
- end
96
-
97
- def parse_vsearch_output cluster_output
45
+ def parse_vsearch_output cluster_output, msa_output
46
+ print "parsing vsearch output" if @verbose
98
47
  clusters = {}
48
+ lookup = {}
49
+ second = 0
50
+ count = 0
99
51
  File.open(cluster_output).each_line do |line|
52
+ count+=1
100
53
  if line.start_with?("S") or line.start_with?("H")
101
54
  cols = line.chomp.split("\t")
102
- cluster = cols[1].to_i
55
+ cluster = cols[1]
56
+ len = cols[2].to_i
57
+ cigar = cols[7]
58
+ strand = cols[4]
59
+ strand = "+" if strand == "*"
103
60
  contig_name = cols[8]
61
+
104
62
  clusters[cluster] ||= []
105
- clusters[cluster] << contig_name
63
+ clusters[cluster] << { :name => contig_name, :strand => strand }
64
+ lookup[contig_name] = cluster
65
+ end
66
+ if count%10_000==0 and @verbose
67
+ print "."
106
68
  end
107
69
  end
108
- return clusters
70
+ puts " Done" if @verbose
71
+ print "parsing msa output " if @verbose
72
+ count = 0
73
+ msa = {}
74
+ Bio::FastaFormat.open(msa_output).each do |entry|
75
+ count += 1
76
+ name = entry.entry_id
77
+ if name != "consensus"
78
+ # name = name[1..-1]
79
+ if name[0]=="*"
80
+ name = name[1..-1]
81
+ end
82
+ # what cluster is name in?
83
+ cluster = lookup[name]
84
+ msa[cluster] ||= []
85
+ msa[cluster] << { :name => name, :seq => entry.seq.seq }
86
+ end
87
+ if count%10_000==0 and @verbose
88
+ print "."
89
+ end
90
+
91
+ end
92
+ puts " Done" if @verbose
93
+ return msa
109
94
  end
110
95
 
111
96
  end
data/lib/transfuse/cmd.rb CHANGED
@@ -18,7 +18,7 @@ module Transfuse
18
18
 
19
19
  def run file=nil
20
20
  unless file.nil?
21
- if File.exist?(file)
21
+ if File.exist?(file) and File.stat(file).size > 0
22
22
  @stdout = ""
23
23
  @stderr = ""
24
24
  @status = Status.new
@@ -0,0 +1,105 @@
1
+
2
+ require 'bio'
3
+ require 'set'
4
+
5
+ module Transfuse
6
+
7
+ class Consensus
8
+
9
+ attr_reader :contigs
10
+
11
+ def initialize verbose
12
+ @verbose = verbose
13
+ end
14
+
15
+ def run msa, scores, output
16
+ return 1 if File.exist?(output)
17
+ print "writing consensus " if @verbose
18
+ # msa is a hash
19
+ # key = cluster id
20
+ # value = list
21
+ # list of sequences in cluster aligned with gaps
22
+ preoutput = "#{File.basename(output, File.extname(output))}_cons.fa"
23
+ count = 0
24
+ File.open("#{output}.data", "w") do |out2|
25
+ File.open(preoutput, "w") do |out|
26
+ msa.each do |id, list|
27
+ count+=1
28
+ print "." if count%5_000==0 and @verbose
29
+ exons={}
30
+ cons = []
31
+ length = list[0][:seq].length
32
+ list.each_with_index do |hash, index|
33
+ seq = hash[:seq]
34
+ name = hash[:name]
35
+ out2.write "#{id}\t#{scores[name][:score]}\t#{name}\n"
36
+ prev = ""
37
+ gap = 0
38
+ exon = 0
39
+ seq.each_char do |c|
40
+ if c=="-"
41
+ base="-"
42
+ else
43
+ base="*"
44
+ end
45
+ if base!=prev
46
+ if c=="-"
47
+ gap+=1
48
+ else
49
+ exon+=1
50
+ end
51
+ end
52
+ if c=="-"
53
+ prev = "-"
54
+ else
55
+ prev = "*"
56
+ end
57
+ end
58
+ exons[index] = exon
59
+ end
60
+
61
+ consensus = ""
62
+ 0.upto(length-1) do |i|
63
+ base="N"
64
+ counts = {}
65
+ list.each_with_index do |hash, index|
66
+ seq = hash[:seq]
67
+ if seq[i] != "-" and seq[i] != "N"
68
+ counts[seq[i]]||=0
69
+ counts[seq[i]] += 1
70
+ if exons[index]==1
71
+ base = seq[i]
72
+ end
73
+ end
74
+ end
75
+ if counts.size>0
76
+ base = counts.sort.last.first
77
+ end
78
+ consensus << base
79
+ end
80
+
81
+ if consensus.count("N") < consensus.length.to_f*0.5
82
+ cons << consensus
83
+ end
84
+
85
+ list.each_with_index do |hash, index|
86
+ if exons[index] > 1
87
+ cons << hash[:seq].delete("-")
88
+ end
89
+ end
90
+
91
+ cons.each_with_index do |s,index|
92
+ out.write ">contig#{id}.#{index+1}\n"
93
+ out.write "#{s}\n"
94
+ end
95
+
96
+ end # msa.each
97
+ end # file
98
+ end # file open
99
+ puts " Done" if @verbose
100
+ return preoutput
101
+ end # def
102
+
103
+ end
104
+
105
+ end
@@ -6,27 +6,28 @@ end
6
6
 
7
7
  module Transfuse
8
8
 
9
+ require 'bio'
9
10
  require 'csv'
10
11
  require 'transrate'
12
+ require 'threach'
11
13
 
12
14
  class Transfuse
13
15
 
14
16
  def initialize threads, verbose
15
17
  @threads = threads
16
18
  @verbose = verbose
17
- @clustalo = Which::which('clustalo').first
18
- raise "clustalo was not in the PATH - please install it" unless @clustalo
19
19
  end
20
20
 
21
21
  def check_files string
22
+ # puts "check file string: #{string}" if @verbose
22
23
  list = []
23
24
  string.split(",").each do |file|
24
25
  file = File.expand_path(file)
25
26
  if File.exist?(file)
26
- puts "#{file} exists" if @verbose
27
+ puts "#{File.basename(file)} exists" if @verbose
27
28
  list << file
28
29
  else
29
- abort "#{file} not found"
30
+ abort "#{File.basename(file)} not found"
30
31
  end
31
32
  end
32
33
  return list
@@ -51,42 +52,27 @@ module Transfuse
51
52
  return File.expand_path(catted_fasta)
52
53
  end
53
54
 
54
- def cluster file
55
- puts "clustering #{file}" if @verbose
56
- cluster = Cluster.new @threads, @verbose
57
- return cluster.run file
58
- end
59
-
60
55
  def load_fasta fasta
56
+ print "loading fasta sequence #{fasta}..." if @verbose
61
57
  @sequences = {}
58
+ count = 1
62
59
  Bio::FastaFormat.open(fasta).each do |entry|
63
60
  @sequences[entry.entry_id] = entry.seq.to_s
61
+ print "." if count%10_000==0 and @verbose
62
+ count +=1
64
63
  end
64
+ puts " Done" if @verbose
65
65
  end
66
66
 
67
- def sequence_alignment clusters
68
- clusters.each do |id, list| # threach
69
- if list.size > 5
70
- seq = ""
71
- list.each do |hash|
72
- seq << ">#{hash[:name]}\n"
73
- if hash[:strand] == "+"
74
- seq << "#{@sequences[hash[:name]]}\n"
75
- elsif hash[:strand] == "-"
76
- seq << "#{@sequences[hash[:name]].revcomp}\n"
77
- else
78
- abort "Unknown strand #{hash[:strand]}"
79
- end
80
- end
81
- cmd = "echo -e \"#{seq}\" | #{@clustalo} -i - --outfmt fa "
82
- cmd << "--output-order tree-order"
83
- align = Cmd.new cmd
84
- align.run
85
- File.open("cluster#{id}.fa", "wb") do |out|
86
- out.write align.stdout
87
- end
88
- end
89
- end
67
+ def cluster file, id
68
+ puts "clustering #{file}" if @verbose
69
+ cluster = Cluster.new @threads, @verbose, id
70
+ return cluster.run file
71
+ end
72
+
73
+ def consensus msa, scores, output
74
+ cons = Consensus.new(@verbose)
75
+ return cons.run(msa, scores, output)
90
76
  end
91
77
 
92
78
  def load_scores files
@@ -96,8 +82,10 @@ module Transfuse
96
82
  :header_converters => :symbol,
97
83
  :converters => :all) do |row|
98
84
  name = row[:contig_name]
99
- score = row[:score]
100
- scores[name] = score
85
+ scores[name] = { :score => row[:score].to_f,
86
+ :p_good => row[:p_good].to_f,
87
+ :p_bases_covered => row[:p_bases_covered].to_f,
88
+ :coverage => row[:coverage].to_f }
101
89
  end
102
90
  end
103
91
  return scores
@@ -107,13 +95,15 @@ module Transfuse
107
95
  filtered_files = []
108
96
  files.each_with_index do |file, index|
109
97
  new_filename = "#{File.basename(file, File.extname(file))}_filtered.fa"
110
- unless File.exist?(new_filename)
98
+ if !File.exist?(new_filename) or File.stat(new_filename).size < 1
111
99
  File.open(new_filename, "wb") do |out|
112
- puts "opening #{file}..."
100
+ puts "filtering #{file}..." if @verbose
113
101
  Bio::FastaFormat.open(file).each do |entry|
114
102
  contig_name = entry.entry_id
115
103
  contig_name = "contig#{index}_#{contig_name}"
116
- if scores.key?(contig_name) and scores[contig_name] > 0.01
104
+ if scores.key?(contig_name) and
105
+ scores[contig_name][:score] > 0.01 and
106
+ scores[contig_name][:coverage] >= 1
117
107
  out.write ">#{contig_name}\n"
118
108
  out.write "#{entry.seq}\n"
119
109
  elsif !scores.key?(contig_name)
@@ -127,75 +117,105 @@ module Transfuse
127
117
  return filtered_files
128
118
  end
129
119
 
120
+ def transrate_consensus file, output, left, right
121
+ output = File.expand_path(output)
122
+ puts "transrate on #{file}" if @verbose
123
+ file = File.expand_path(file)
124
+ name = File.basename(file, File.extname(file))
125
+ dir = "transrate_#{name}"
126
+ Dir.mkdir(dir) unless Dir.exist?(dir)
127
+ Dir.chdir(dir) do
128
+ assembly = Transrate::Assembly.new(file)
129
+ transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
130
+ rename = "assembly_#{name}_score_optimisation.csv"
131
+ rm = transrater.read_metrics(left.join(','), right.join(','))
132
+ stats = rm.read_stats
133
+ File.rename("assembly_score_optimisation.csv", rename)
134
+ scores={}
135
+ assembly.each do |name, contig|
136
+ scores[name] = { :score => contig.score.to_f,
137
+ :p_good => contig.p_good.to_f,
138
+ :p_bases_covered => contig.p_bases_covered.to_f,
139
+ :coverage => contig.coverage.to_f }
140
+ end
141
+ scores_file = "#{name}_scores.csv"
142
+ stats_file = "../#{name}_stats.txt"
143
+ puts " writing scores" if @verbose
144
+ File.open(scores_file, "wb") do |out|
145
+ scores.each do |name, hash|
146
+ out.write "#{name}\t#{hash[:score]}\t#{hash[:p_good]}\t"
147
+ out.write "#{hash[:p_bases_covered]}\t#{hash[:coverage]}\n"
148
+ end
149
+ end
150
+ puts " writing filtered fasta file" if @verbose
151
+ File.open(output, "wb") do |out|
152
+ assembly.each do |name, contig|
153
+ if contig.score.to_f > 0.01 and contig.coverage.to_f >= 1
154
+ out.write ">#{name}\n"
155
+ out.write "#{contig.seq.seq}\n"
156
+ end
157
+ end
158
+ end
159
+ puts " writing stats" if @verbose
160
+ File.open(stats_file, "wb") do |out|
161
+ stats.each do |key, value|
162
+ out.write "#{key}\t#{value}\n"
163
+ end
164
+ out.write "assembly score:\t#{transrater.assembly_score}\n"
165
+ optimal = transrater.assembly_optimal_score("prefix")
166
+ out.write "optimal score :\t#{optimal[0]}\n"
167
+ out.write "cutoff :\t#{optimal[1]}\n"
168
+ end
169
+ end
170
+ end
130
171
 
131
172
  def transrate files, left, right
132
173
  scores = {}
133
- scores_file = "scores.csv"
174
+ shortname = ""
175
+ files.each do |n|
176
+ shortname << File.basename(n, File.extname(n))[0..4]
177
+ end
178
+ scores_file = "#{shortname}_scores.csv"
134
179
  if File.exist?(scores_file)
135
180
  puts "loading scores from file" if @verbose
136
181
  File.open(scores_file).each do |line|
137
- name, score = line.chomp.split("\t")
138
- scores[name] = score.to_f
182
+ name, score, p_good, p_bases_covered, coverage = line.chomp.split("\t")
183
+ scores[name] = { :score => score.to_f,
184
+ :p_good => p_good.to_f,
185
+ :p_bases_covered => p_bases_covered.to_f,
186
+ :coverage => coverage.to_f }
139
187
  end
140
188
  else
141
189
  files.each_with_index do |fasta, index|
142
190
  puts "transrate on #{fasta}" if @verbose
143
- assembly = Transrate::Assembly.new(fasta)
144
- transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
145
- transrater.read_metrics(left.join(','), right.join(','))
146
- assembly.each do |name, contig|
147
- name = "contig#{index}_#{name}"
148
- scores[name] = contig.score
191
+ dir = "transrate_#{File.basename(fasta, File.extname(fasta))}"
192
+ Dir.mkdir(dir) unless Dir.exist?(dir)
193
+ Dir.chdir(dir) do
194
+ assembly = Transrate::Assembly.new(fasta)
195
+ transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
196
+ rename = "assembly#{index}_score_optimisation.csv"
197
+ transrater.read_metrics(left.join(','), right.join(','))
198
+ File.rename("assembly_score_optimisation.csv", rename)
199
+ assembly.each do |name, contig|
200
+ name = "contig#{index}_#{name}"
201
+ scores[name] = { :score => contig.score.to_f,
202
+ :p_good => contig.p_good.to_f,
203
+ :p_bases_covered => contig.p_bases_covered.to_f,
204
+ :coverage => contig.coverage.to_f }
205
+
206
+ end
149
207
  end
150
208
  end
151
209
  File.open(scores_file, "wb") do |out|
152
- scores.each do |name, score|
153
- out.write "#{name}\t#{score}\n"
210
+ scores.each do |name, hash|
211
+ out.write "#{name}\t#{hash[:score]}\t#{hash[:p_good]}\t"
212
+ out.write "#{hash[:p_bases_covered]}\t#{hash[:coverage]}\n"
154
213
  end
155
214
  end
156
215
  end
157
216
  return scores
158
217
  end
159
218
 
160
- def select_contigs clusters, scores
161
- puts "selecting contigs" if @verbose
162
- best = []
163
- clusters.each do |cluster_id, list|
164
- best_score = 0
165
- best_contig = ""
166
- list.each do |contig_name|
167
- unless scores[contig_name]
168
- abort "can't find #{contig_name} in scores hash\n"
169
- end
170
- if scores[contig_name] > best_score
171
- best_score = scores[contig_name]
172
- best_contig = contig_name
173
- end
174
- end
175
- best << best_contig
176
- end
177
- return best
178
- end
179
-
180
- def output_contigs best, fasta, output
181
- puts "writing contigs" if @verbose
182
- # read in catted fasta sequences
183
- sequences = {}
184
- Bio::FastaFormat.open(fasta).each do |entry|
185
- sequences[entry.entry_id] = entry.seq
186
- end
187
- File.open(output, "wb") do |out|
188
- best.each do |contig_name|
189
- if sequences.key?(contig_name)
190
- out.write ">#{contig_name}\n"
191
- out.write "#{sequences[contig_name]}\n"
192
- else
193
- puts "can't find #{contig_name} in #{fasta}"
194
- end
195
- end
196
- end
197
- end
198
-
199
219
  end
200
220
 
201
221
  end
@@ -7,8 +7,8 @@ module Transfuse
7
7
  # Semantic Versioning 2.0 (http://semver.org/).
8
8
  module VERSION
9
9
  MAJOR = 0
10
- MINOR = 1
11
- PATCH = 4
10
+ MINOR = 4
11
+ PATCH = 2
12
12
  BUILD = nil
13
13
 
14
14
  STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
data/lib/transfuse.rb CHANGED
@@ -1,4 +1,5 @@
1
- require 'transfuse/cluster.rb'
2
- require 'transfuse/cmd.rb'
3
- require 'transfuse/transfuse.rb'
4
- require 'transfuse/version.rb'
1
+ require 'transfuse/cluster'
2
+ require 'transfuse/cmd'
3
+ require 'transfuse/consensus'
4
+ require 'transfuse/transfuse'
5
+ require 'transfuse/version'
@@ -8,13 +8,13 @@ class TestTransfuse < Test::Unit::TestCase
8
8
  context 'transfuse' do
9
9
 
10
10
  setup do
11
- @fuser = Transfuse::Transfuse.new 4
11
+ @fuser = Transfuse::Transfuse.new 4, true
12
12
  end
13
13
 
14
14
  teardown do
15
15
  end
16
16
 
17
- should 'check for existence of files' do
17
+ should '1 check for existence of files' do
18
18
  list = []
19
19
  list << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
20
20
  list << File.join(File.dirname(__FILE__), 'data', 'assembly2.fasta')
@@ -22,7 +22,7 @@ class TestTransfuse < Test::Unit::TestCase
22
22
  assert_equal 2, files.length, "length"
23
23
  end
24
24
 
25
- should "concatenate two files" do
25
+ should "2 concatenate two files" do
26
26
  list = []
27
27
  list << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
28
28
  list << File.join(File.dirname(__FILE__), 'data', 'assembly2.fasta')
@@ -36,71 +36,69 @@ class TestTransfuse < Test::Unit::TestCase
36
36
  end
37
37
  end
38
38
 
39
- should "cluster fasta file" do
40
- Dir.mktmpdir do |tmpdir|
39
+ should "3 cluster fasta file" do
40
+ # Dir.mktmpdir do |tmpdir|
41
+ tmpdir = Dir.mktmpdir
41
42
  Dir.chdir(tmpdir) do
42
43
  file = File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
43
44
  hash = @fuser.cluster file
44
45
  assert_equal 250, hash.size, "output size"
45
46
  end
46
- end
47
+ # end
47
48
  end
48
49
 
49
- should "load scores from transrate output" do
50
+ should "4 load scores from transrate output" do
50
51
  files = []
51
52
  files << File.join(File.dirname(__FILE__), 'data', 'contig_scores1.csv')
52
53
  hash = @fuser.load_scores files
53
54
  assert_equal 99, hash.size
54
55
  end
55
56
 
56
- should "filter contigs" do
57
+ should "5 run transrate on assembly files with reads" do
57
58
  files = []
58
- scores = {}
59
- files << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
60
- scores["soap_contig173359"] = 1
61
- scores["soap_contig38533"] = 0.5
62
- scores["idba_contig44716"] = 0
63
- new_list = @fuser.filter files, scores
64
- assert_equal 1, new_list.length
65
- cmd = "grep -c \">\" #{new_list.first}"
66
- assert_equal 2, `#{cmd}`.chomp.split.first.to_i, "contigs"
59
+ left = []
60
+ right = []
61
+ files << File.join(File.dirname(__FILE__), 'data', 'assembly3.fasta')
62
+ left << File.join(File.dirname(__FILE__), 'data', 'left.fq')
63
+ right << File.join(File.dirname(__FILE__), 'data', 'right.fq')
64
+ # Dir.mktmpdir do |tmpdir|
65
+ tmpdir = Dir.mktmpdir
66
+ Dir.chdir(tmpdir) do
67
+ scores = @fuser.transrate files, left, right
68
+ assert_equal 100, scores.size, "scores size"
69
+ end
70
+ # end
67
71
  end
68
72
 
69
- should "run transrate on assembly files with reads" do
73
+ should "6 filter contigs" do
70
74
  files = []
71
75
  left = []
72
76
  right = []
73
- files << File.join(File.dirname(__FILE__), 'data', 'assembly3.fasta')
77
+ files << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
74
78
  left << File.join(File.dirname(__FILE__), 'data', 'left.fq')
75
79
  right << File.join(File.dirname(__FILE__), 'data', 'right.fq')
76
- Dir.mktmpdir do |tmpdir|
80
+ # Dir.mktmpdir do |tmpdir|
81
+ tmpdir = Dir.mktmpdir
77
82
  Dir.chdir(tmpdir) do
78
83
  scores = @fuser.transrate files, left, right
79
- assert_equal 100, scores.size, "scores size"
84
+ scores.each do |contig, score|
85
+ # puts "#{contig}\t#{score}"
86
+ end
87
+ new_list = @fuser.filter files, scores
88
+ assert_equal 1, new_list.length
89
+ cmd = "grep -c \">\" #{new_list.first}"
90
+ assert_equal 1, `#{cmd}`.chomp.split.first.to_i, "number of contigs"
80
91
  end
81
- end
92
+ # end
93
+
82
94
  end
83
95
 
84
- should "select contigs" do
85
- clusters = {"0" => ["contig1", "contig2"], "1" => ["contig3", "contig4"]}
86
- scores = { "contig1" => 0.2,
87
- "contig2" => 0.3,
88
- "contig3" => 0.4,
89
- "contig4" => 0.2 }
90
- best = @fuser.select_contigs clusters, scores
91
- assert_equal 2, best.size
92
- assert_equal "contig2", best[0]
93
- assert_equal "contig3", best[1]
96
+ should "7 get consensus of clusters" do
97
+
94
98
  end
95
99
 
96
- should "output contigs" do
97
- best = ["soap_contig173359", "oases_contig80246"]
98
- file = File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
99
- Dir.mktmpdir do |tmpdir|
100
- Dir.chdir(tmpdir) do
101
- @fuser.output_contigs best, file, "out"
102
- end
103
- end
100
+ should "8 not fail when there are duplicated kmers in the input sequences" do
101
+
104
102
  end
105
103
 
106
104
  end
data/transfuse.gemspec CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |gem|
19
19
  gem.add_dependency 'bio', '~> 1.4', '>= 1.4.3'
20
20
  gem.add_dependency 'fixwhich', '~> 1.0', '>= 1.0.2'
21
21
  gem.add_dependency 'bindeps', '~> 1.0', '>= 1.0.1'
22
- gem.add_dependency 'transrate', '~> 1.0', '>= 1.0.0'
22
+ gem.add_dependency 'transrate', '~> 1.0', '>= 1.0.1'
23
23
 
24
24
  gem.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
25
25
  gem.add_development_dependency 'turn', '~> 0.9', '>= 0.9.7'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: transfuse
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Richard Smith-Unna
@@ -94,7 +94,7 @@ dependencies:
94
94
  version: '1.0'
95
95
  - - ">="
96
96
  - !ruby/object:Gem::Version
97
- version: 1.0.0
97
+ version: 1.0.1
98
98
  type: :runtime
99
99
  prerelease: false
100
100
  version_requirements: !ruby/object:Gem::Requirement
@@ -104,7 +104,7 @@ dependencies:
104
104
  version: '1.0'
105
105
  - - ">="
106
106
  - !ruby/object:Gem::Version
107
- version: 1.0.0
107
+ version: 1.0.1
108
108
  - !ruby/object:Gem::Dependency
109
109
  name: rake
110
110
  requirement: !ruby/object:Gem::Requirement
@@ -210,7 +210,6 @@ extra_rdoc_files: []
210
210
  files:
211
211
  - ".gitignore"
212
212
  - Gemfile
213
- - Gemfile.lock
214
213
  - README.md
215
214
  - Rakefile
216
215
  - bin/transfuse
@@ -218,6 +217,7 @@ files:
218
217
  - lib/transfuse.rb
219
218
  - lib/transfuse/cluster.rb
220
219
  - lib/transfuse/cmd.rb
220
+ - lib/transfuse/consensus.rb
221
221
  - lib/transfuse/transfuse.rb
222
222
  - lib/transfuse/version.rb
223
223
  - notes.md
@@ -248,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
248
248
  version: '0'
249
249
  requirements: []
250
250
  rubyforge_project:
251
- rubygems_version: 2.4.6
251
+ rubygems_version: 2.2.2
252
252
  signing_key:
253
253
  specification_version: 4
254
254
  summary: Merge assemblies
data/Gemfile.lock DELETED
@@ -1,87 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- transfuse (0.1.1)
5
- bindeps (~> 1.0, >= 1.0.1)
6
- bio (~> 1.4, >= 1.4.3)
7
- fixwhich (~> 1.0, >= 1.0.2)
8
- transrate (= 1.0.0.beta3)
9
- trollop (~> 2.0)
10
-
11
- GEM
12
- remote: https://rubygems.org/
13
- specs:
14
- ansi (1.5.0)
15
- bindeps (1.1.2)
16
- fixwhich (~> 1.0, >= 1.0.2)
17
- bio (1.4.3.0001)
18
- coveralls (0.8.1)
19
- json (~> 1.8)
20
- rest-client (>= 1.6.8, < 2)
21
- simplecov (~> 0.10.0)
22
- term-ansicolor (~> 1.3)
23
- thor (~> 0.19.1)
24
- crb-blast (0.6.4)
25
- bindeps (~> 1.0, >= 1.0.3)
26
- bio (~> 1.4, >= 1.4.3)
27
- fixwhich (~> 1.0, >= 1.0.2)
28
- threach (~> 0.2, >= 0.2.0)
29
- trollop (~> 2.0)
30
- docile (1.1.5)
31
- domain_name (0.5.24)
32
- unf (>= 0.0.5, < 1.0.0)
33
- facade (1.0.6)
34
- fix-trinity-output (1.0.0)
35
- trollop (~> 2.0)
36
- fixwhich (1.0.2)
37
- pathname2 (~> 1.4, >= 1.4.4)
38
- http-cookie (1.0.2)
39
- domain_name (~> 0.5)
40
- json (1.8.3)
41
- mime-types (2.6.1)
42
- minitest (4.7.5)
43
- netrc (0.10.3)
44
- pathname2 (1.7.3)
45
- facade
46
- rake (10.4.2)
47
- rest-client (1.8.0)
48
- http-cookie (>= 1.0.2, < 2.0)
49
- mime-types (>= 1.16, < 3.0)
50
- netrc (~> 0.7)
51
- shoulda-context (1.2.1)
52
- simplecov (0.10.0)
53
- docile (~> 1.1.0)
54
- json (~> 1.8)
55
- simplecov-html (~> 0.10.0)
56
- simplecov-html (0.10.0)
57
- term-ansicolor (1.3.0)
58
- tins (~> 1.0)
59
- thor (0.19.1)
60
- threach (0.2.0)
61
- tins (1.5.2)
62
- transrate (1.0.0.beta3)
63
- bindeps (~> 1.1, >= 1.1.2)
64
- bio (~> 1.4, >= 1.4.3)
65
- crb-blast (~> 0.5, >= 0.5.0)
66
- fix-trinity-output (~> 1.0, >= 1.0)
67
- trollop (~> 2.0, >= 2.0.0)
68
- yell (~> 2.0, >= 2.0.4)
69
- trollop (2.1.1)
70
- turn (0.9.7)
71
- ansi
72
- minitest (~> 4)
73
- unf (0.1.4)
74
- unf_ext
75
- unf_ext (0.0.7.1)
76
- yell (2.0.5)
77
-
78
- PLATFORMS
79
- ruby
80
-
81
- DEPENDENCIES
82
- coveralls (~> 0.7)
83
- rake (~> 10.3, >= 10.3.2)
84
- shoulda-context (~> 1.2, >= 1.2.1)
85
- simplecov (~> 0.8, >= 0.8.2)
86
- transfuse!
87
- turn (~> 0.9, >= 0.9.7)