transfuse 0.1.4 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a5b17813bba553fa78a7e3cf24366981e368ebfe
4
- data.tar.gz: f57a5011b3296862518ce54a7f3d4bb928e05aab
3
+ metadata.gz: 03b84ea2236b7c43ce321f64352bb6cb7b7035d4
4
+ data.tar.gz: ed70d9a9c204a06b550a7b95c01c74afdcb98b51
5
5
  SHA512:
6
- metadata.gz: 52af21dbf0c5b49aa746d515a214229bccd78a55d5998ddc929b2594d8abe5bafcb9cd1bbc98ec6ea1a41a5cb3240666616837d3d06c5cbacd25acca3fe392ca
7
- data.tar.gz: 147cc022f22ad319ff0651a6092ced5d3f1c7e99ab3989a8091970ab5c25a97e42d9b98f7384e28712ef802c4ced8090c3942ba66105963013eb30e46c3b58cb
6
+ metadata.gz: 6074f06d8afd9e33fce7b77e0221ffc8a300da966cc7916f86eb245d02c1e2b5519a831e1c99962fcfd5b085b5cd3d94765297ec94469270e4246c6c9ab71830
7
+ data.tar.gz: d992bac7cb1a98cd523e4df28d5b684f42bf8d8d7deb7882d42ecd8bb50bb137a190200006c7528f834d9e30a214b1f304a46b8502176993e22b019658c19061
data/README.md CHANGED
@@ -1 +1,42 @@
1
- Transfuse
1
+ ## Transfuse
2
+
3
+ **Transfuse is currently in development and is not yet ready for use**
4
+
5
+ Transfuse intelligently merges your multiple de novo transcriptome assemblies. Run multiple assemblies with different de novo assemblers, or with different settings in the same assembler, and have them combined into a single high-quality transcriptome.
6
+
7
+ Transfuse takes in the reads you used to do the assembly and a list of fasta files and produces a single output fasta file.
8
+
9
+ ### Installation and Running
10
+
11
+ To install Transfuse, clone this repo:
12
+
13
+ `git clone https://github.com/cboursnell/transfuse.git`
14
+
15
+ Then build and install the Ruby gem:
16
+
17
+ `gem build *spec; gem install *gem`
18
+
19
+ ### Usage
20
+
21
+ Transfuse is run on the command line. The options are:
22
+
23
+ ```
24
+ -a, --assembly=<s> assembly files in FASTA format, comma-separated
25
+ -l, --left=<s> left reads file in FASTQ format
26
+ -r, --right=<s> right reads file in FASTQ format
27
+ -o, --output=<s> write merged assembly to file
28
+ -t, --threads=<i> number of threads (default: 1)
29
+ -v, --verbose be verbose
30
+ -e, --version Print version and exit
31
+ -h, --help Show this message
32
+ ```
33
+
34
+ An example command:
35
+
36
+ `transfuse --assembly soap-k31.fa,soap-k41.fa,soap-k51.fa --left reads_1.fq --right reads_2.fq --output soap-merged.fa --threads 12`
37
+
38
+ ### License
39
+
40
+ This is academic software - please cite us if you use it in your work.
41
+
42
+ Transfuse is released under the MIT license.
data/Rakefile CHANGED
@@ -5,15 +5,21 @@ Rake::TestTask.new do |t|
5
5
  end
6
6
 
7
7
  Rake::TestTask.new do |t|
8
- t.name = :corset
8
+ t.name = :cluster
9
9
  t.libs << 'test'
10
- t.test_files = ['test/test_corset.rb']
10
+ t.test_files = ['test/test_cluster.rb']
11
11
  end
12
12
 
13
13
  Rake::TestTask.new do |t|
14
- t.name = :cluster
14
+ t.name = :fuse
15
15
  t.libs << 'test'
16
- t.test_files = ['test/test_cluster.rb']
16
+ t.test_files = ['test/test_transfuse.rb']
17
+ end
18
+
19
+ Rake::TestTask.new do |t|
20
+ t.name = :cons
21
+ t.libs << 'test'
22
+ t.test_files = ['test/test_consensus.rb']
17
23
  end
18
24
 
19
25
  desc "Run tests"
data/bin/transfuse CHANGED
@@ -22,23 +22,24 @@ opts = Trollop::options do
22
22
  OPTIONS:
23
23
 
24
24
  EOS
25
- opt :assembly, "assembly files in FASTA format, comma-separated",
25
+ opt :assemblies, "assembly files in FASTA format, comma-separated",
26
26
  :type => String, :required => true
27
- opt :scores, "transrate contig score output files, comma-separated",
28
- :type => String
29
27
  opt :left, "left reads file in FASTQ format",
30
28
  :type => String
31
29
  opt :right, "right reads file in FASTQ format",
32
30
  :type => String
31
+ opt :scores, "transrate contig score output files, comma-separated. Ignored if reads are provided",
32
+ :type => String
33
33
  opt :output, "write merged assembly to file",
34
34
  :type => String, :required => :true
35
35
  opt :threads, "number of threads", :type => :int, :default => 1
36
+ opt :id, "sequence identity to cluster at", :type => :float, :default => 1.0
36
37
  opt :verbose, "be verbose"
37
38
  end
38
39
 
39
40
  transfuse = Transfuse::Transfuse.new opts.threads, opts.verbose
40
41
 
41
- assembly_files = transfuse.check_files opts.assembly
42
+ assembly_files = transfuse.check_files opts.assemblies
42
43
  score_files = transfuse.check_files opts.score if opts.score
43
44
  left = transfuse.check_files opts.left if opts.left
44
45
  right = transfuse.check_files opts.right if opts.right
@@ -54,6 +55,7 @@ else
54
55
  abort msg
55
56
  end
56
57
 
58
+ # filter out assemblies with low score
57
59
  assembly_files = transfuse.filter assembly_files, scores
58
60
 
59
61
  # concatenate assemblies into one fasta file
@@ -62,12 +64,11 @@ cat = transfuse.concatenate assembly_files
62
64
  # load fasta sequences from concatenated file into hash
63
65
  transfuse.load_fasta cat
64
66
 
65
- # cluster using vsearch or maybe cd-hit-est
66
- clusters = transfuse.cluster cat
67
-
68
- transfuse.sequence_alignment clusters
69
- # pull out contigs from each cluster based on the scores
70
- # best = transfuse.select_contigs clusters, scores
67
+ # cluster using vsearch
68
+ msa = transfuse.cluster cat, opts.id
71
69
 
72
- # transfuse.output_contigs best, cat, opts.output
70
+ # read the msa from vsearch and produce a consensus fasta
71
+ cons = transfuse.consensus msa, scores, opts.output
73
72
 
73
+ # transrate the consensus output to remove low scoring contigs
74
+ transfuse.transrate_consensus cons, opts.output, left, right
data/deps/deps.yaml CHANGED
@@ -0,0 +1,11 @@
1
+ vsearch:
2
+ binaries:
3
+ - vsearch
4
+ version:
5
+ number: '1.1.3'
6
+ command: 'vsearch --version'
7
+ url:
8
+ 64bit:
9
+ linux: https://github.com/torognes/vsearch/releases/download/v1.1.3/vsearch-1.1.3-linux-x86_64
10
+ macosx: https://github.com/torognes/vsearch/releases/download/v1.1.3/vsearch-1.1.3-osx-x86_64
11
+ unpack: false
@@ -5,61 +5,31 @@ module Transfuse
5
5
 
6
6
  class Cluster
7
7
 
8
- def initialize threads, verbose
9
- @cdhit = Which::which('cd-hit-est').first
10
- raise "cd-hit-est was not in the PATH - please install it" unless @cdhit
8
+ def initialize threads, verbose, id
11
9
  @vsearch = Which::which('vsearch').first
12
10
  raise "vsearch was not in the PATH - please install it" unless @vsearch
13
- @id = "1.00"
11
+ @id = id.to_s
14
12
  @threads = threads
15
13
  @verbose = verbose
16
14
  end
17
15
 
18
16
  def run fasta
19
- use_cd_hit = false
20
- if use_cd_hit
21
- output = cd_hit fasta
22
- return parse_output output
23
- else
24
- output = vsearch fasta
25
- return parse_vsearch_output output
26
- end
27
- end
28
-
29
- def cd_hit fasta
30
- puts "running cd-hit-est" if @verbose
31
- output = "#{File.basename(fasta, File.extname(fasta))}_cdhit.fa"
32
- cdhit_cmd = generate_cdhit_command fasta, output
33
- puts cdhit_cmd if @verbose
34
- cluster = Cmd.new cdhit_cmd
35
- cluster.run output
36
- return "#{output}.clstr"
17
+ cluster_output, msa_output = vsearch fasta
18
+ return parse_vsearch_output(cluster_output, msa_output)
37
19
  end
38
20
 
39
21
  def vsearch fasta
40
- puts "running vsearch" if @verbose
41
- cluster_output = "#{fasta}.clust"
42
- vsearch_cmd = generate_vsearch_command fasta, cluster_output
22
+ print "running vsearch..." if @verbose
23
+ cluster_output = "#{File.basename(fasta)}-#{@id}.clust"
24
+ msa_output = "#{File.basename(fasta)}-#{@id}.aln"
25
+ vsearch_cmd = generate_vsearch_command fasta, cluster_output, msa_output
43
26
  cluster = Cmd.new vsearch_cmd
44
27
  cluster.run cluster_output
45
- return cluster_output
46
- end
47
-
48
- def generate_cdhit_command fasta, out
49
- #cd-hit-est -i all.fa -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
50
- cmd = "#{@cdhit}"
51
- cmd << " -i #{fasta}"
52
- cmd << " -o #{out}"
53
- cmd << " -c #{@id}" # similarity = number of identical bases /
54
- # length of shorter sequences
55
- cmd << " -T #{@threads}"
56
- cmd << " -n 10" # word length - maybe increase??
57
- cmd << " -d 100" # output name width
58
- cmd << " -g 1" # slower but more accurate mode
59
- cmd << " -M 8000" # increase memory
28
+ puts " Done. Created #{cluster_output}" if @verbose
29
+ return [cluster_output, msa_output]
60
30
  end
61
31
 
62
- def generate_vsearch_command fasta, out
32
+ def generate_vsearch_command fasta, out, msa
63
33
  vsearch = "#{@vsearch}"
64
34
  vsearch << " --cluster_fast #{fasta}"
65
35
  vsearch << " --id #{@id}"
@@ -67,45 +37,60 @@ module Transfuse
67
37
  vsearch << " --qmask none" # no masking
68
38
  vsearch << " --strand both"
69
39
  vsearch << " --uc #{out}"
40
+ vsearch << " --msaout #{msa}"
70
41
  vsearch << " --threads #{@threads}"
71
42
  return vsearch
72
43
  end
73
44
 
74
- def parse_output cluster_output
75
- puts "parsing cd-hit output #{cluster_output}" if @verbose
76
- cluster_id = 0
77
- clusters = {}
78
- File.open(cluster_output).each_line do |line|
79
- if line =~ />Cluster\ ([0-9]+)/
80
- cluster_id = $1.to_i
81
- elsif line =~ /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
82
- contig_name = $1
83
- strand = $2
84
- id = $3.to_f
85
- clusters[cluster_id] ||= []
86
- clusters[cluster_id] << { :name => contig_name, :strand => strand }
87
- elsif line =~ /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
88
- contig_name = $1
89
- strand = "+"
90
- clusters[cluster_id] ||= []
91
- clusters[cluster_id] << { :name => contig_name, :strand => strand }
92
- end
93
- end
94
- return clusters
95
- end
96
-
97
- def parse_vsearch_output cluster_output
45
+ def parse_vsearch_output cluster_output, msa_output
46
+ print "parsing vsearch output" if @verbose
98
47
  clusters = {}
48
+ lookup = {}
49
+ second = 0
50
+ count = 0
99
51
  File.open(cluster_output).each_line do |line|
52
+ count+=1
100
53
  if line.start_with?("S") or line.start_with?("H")
101
54
  cols = line.chomp.split("\t")
102
- cluster = cols[1].to_i
55
+ cluster = cols[1]
56
+ len = cols[2].to_i
57
+ cigar = cols[7]
58
+ strand = cols[4]
59
+ strand = "+" if strand == "*"
103
60
  contig_name = cols[8]
61
+
104
62
  clusters[cluster] ||= []
105
- clusters[cluster] << contig_name
63
+ clusters[cluster] << { :name => contig_name, :strand => strand }
64
+ lookup[contig_name] = cluster
65
+ end
66
+ if count%10_000==0 and @verbose
67
+ print "."
106
68
  end
107
69
  end
108
- return clusters
70
+ puts " Done" if @verbose
71
+ print "parsing msa output " if @verbose
72
+ count = 0
73
+ msa = {}
74
+ Bio::FastaFormat.open(msa_output).each do |entry|
75
+ count += 1
76
+ name = entry.entry_id
77
+ if name != "consensus"
78
+ # name = name[1..-1]
79
+ if name[0]=="*"
80
+ name = name[1..-1]
81
+ end
82
+ # what cluster is name in?
83
+ cluster = lookup[name]
84
+ msa[cluster] ||= []
85
+ msa[cluster] << { :name => name, :seq => entry.seq.seq }
86
+ end
87
+ if count%10_000==0 and @verbose
88
+ print "."
89
+ end
90
+
91
+ end
92
+ puts " Done" if @verbose
93
+ return msa
109
94
  end
110
95
 
111
96
  end
data/lib/transfuse/cmd.rb CHANGED
@@ -18,7 +18,7 @@ module Transfuse
18
18
 
19
19
  def run file=nil
20
20
  unless file.nil?
21
- if File.exist?(file)
21
+ if File.exist?(file) and File.stat(file).size > 0
22
22
  @stdout = ""
23
23
  @stderr = ""
24
24
  @status = Status.new
@@ -0,0 +1,105 @@
1
+
2
+ require 'bio'
3
+ require 'set'
4
+
5
+ module Transfuse
6
+
7
+ class Consensus
8
+
9
+ attr_reader :contigs
10
+
11
+ def initialize verbose
12
+ @verbose = verbose
13
+ end
14
+
15
+ def run msa, scores, output
16
+ return 1 if File.exist?(output)
17
+ print "writing consensus " if @verbose
18
+ # msa is a hash
19
+ # key = cluster id
20
+ # value = list
21
+ # list of sequences in cluster aligned with gaps
22
+ preoutput = "#{File.basename(output, File.extname(output))}_cons.fa"
23
+ count = 0
24
+ File.open("#{output}.data", "w") do |out2|
25
+ File.open(preoutput, "w") do |out|
26
+ msa.each do |id, list|
27
+ count+=1
28
+ print "." if count%5_000==0 and @verbose
29
+ exons={}
30
+ cons = []
31
+ length = list[0][:seq].length
32
+ list.each_with_index do |hash, index|
33
+ seq = hash[:seq]
34
+ name = hash[:name]
35
+ out2.write "#{id}\t#{scores[name][:score]}\t#{name}\n"
36
+ prev = ""
37
+ gap = 0
38
+ exon = 0
39
+ seq.each_char do |c|
40
+ if c=="-"
41
+ base="-"
42
+ else
43
+ base="*"
44
+ end
45
+ if base!=prev
46
+ if c=="-"
47
+ gap+=1
48
+ else
49
+ exon+=1
50
+ end
51
+ end
52
+ if c=="-"
53
+ prev = "-"
54
+ else
55
+ prev = "*"
56
+ end
57
+ end
58
+ exons[index] = exon
59
+ end
60
+
61
+ consensus = ""
62
+ 0.upto(length-1) do |i|
63
+ base="N"
64
+ counts = {}
65
+ list.each_with_index do |hash, index|
66
+ seq = hash[:seq]
67
+ if seq[i] != "-" and seq[i] != "N"
68
+ counts[seq[i]]||=0
69
+ counts[seq[i]] += 1
70
+ if exons[index]==1
71
+ base = seq[i]
72
+ end
73
+ end
74
+ end
75
+ if counts.size>0
76
+ base = counts.sort.last.first
77
+ end
78
+ consensus << base
79
+ end
80
+
81
+ if consensus.count("N") < consensus.length.to_f*0.5
82
+ cons << consensus
83
+ end
84
+
85
+ list.each_with_index do |hash, index|
86
+ if exons[index] > 1
87
+ cons << hash[:seq].delete("-")
88
+ end
89
+ end
90
+
91
+ cons.each_with_index do |s,index|
92
+ out.write ">contig#{id}.#{index+1}\n"
93
+ out.write "#{s}\n"
94
+ end
95
+
96
+ end # msa.each
97
+ end # file
98
+ end # file open
99
+ puts " Done" if @verbose
100
+ return preoutput
101
+ end # def
102
+
103
+ end
104
+
105
+ end
@@ -6,27 +6,28 @@ end
6
6
 
7
7
  module Transfuse
8
8
 
9
+ require 'bio'
9
10
  require 'csv'
10
11
  require 'transrate'
12
+ require 'threach'
11
13
 
12
14
  class Transfuse
13
15
 
14
16
  def initialize threads, verbose
15
17
  @threads = threads
16
18
  @verbose = verbose
17
- @clustalo = Which::which('clustalo').first
18
- raise "clustalo was not in the PATH - please install it" unless @clustalo
19
19
  end
20
20
 
21
21
  def check_files string
22
+ # puts "check file string: #{string}" if @verbose
22
23
  list = []
23
24
  string.split(",").each do |file|
24
25
  file = File.expand_path(file)
25
26
  if File.exist?(file)
26
- puts "#{file} exists" if @verbose
27
+ puts "#{File.basename(file)} exists" if @verbose
27
28
  list << file
28
29
  else
29
- abort "#{file} not found"
30
+ abort "#{File.basename(file)} not found"
30
31
  end
31
32
  end
32
33
  return list
@@ -51,42 +52,27 @@ module Transfuse
51
52
  return File.expand_path(catted_fasta)
52
53
  end
53
54
 
54
- def cluster file
55
- puts "clustering #{file}" if @verbose
56
- cluster = Cluster.new @threads, @verbose
57
- return cluster.run file
58
- end
59
-
60
55
  def load_fasta fasta
56
+ print "loading fasta sequence #{fasta}..." if @verbose
61
57
  @sequences = {}
58
+ count = 1
62
59
  Bio::FastaFormat.open(fasta).each do |entry|
63
60
  @sequences[entry.entry_id] = entry.seq.to_s
61
+ print "." if count%10_000==0 and @verbose
62
+ count +=1
64
63
  end
64
+ puts " Done" if @verbose
65
65
  end
66
66
 
67
- def sequence_alignment clusters
68
- clusters.each do |id, list| # threach
69
- if list.size > 5
70
- seq = ""
71
- list.each do |hash|
72
- seq << ">#{hash[:name]}\n"
73
- if hash[:strand] == "+"
74
- seq << "#{@sequences[hash[:name]]}\n"
75
- elsif hash[:strand] == "-"
76
- seq << "#{@sequences[hash[:name]].revcomp}\n"
77
- else
78
- abort "Unknown strand #{hash[:strand]}"
79
- end
80
- end
81
- cmd = "echo -e \"#{seq}\" | #{@clustalo} -i - --outfmt fa "
82
- cmd << "--output-order tree-order"
83
- align = Cmd.new cmd
84
- align.run
85
- File.open("cluster#{id}.fa", "wb") do |out|
86
- out.write align.stdout
87
- end
88
- end
89
- end
67
+ def cluster file, id
68
+ puts "clustering #{file}" if @verbose
69
+ cluster = Cluster.new @threads, @verbose, id
70
+ return cluster.run file
71
+ end
72
+
73
+ def consensus msa, scores, output
74
+ cons = Consensus.new(@verbose)
75
+ return cons.run(msa, scores, output)
90
76
  end
91
77
 
92
78
  def load_scores files
@@ -96,8 +82,10 @@ module Transfuse
96
82
  :header_converters => :symbol,
97
83
  :converters => :all) do |row|
98
84
  name = row[:contig_name]
99
- score = row[:score]
100
- scores[name] = score
85
+ scores[name] = { :score => row[:score].to_f,
86
+ :p_good => row[:p_good].to_f,
87
+ :p_bases_covered => row[:p_bases_covered].to_f,
88
+ :coverage => row[:coverage].to_f }
101
89
  end
102
90
  end
103
91
  return scores
@@ -107,13 +95,15 @@ module Transfuse
107
95
  filtered_files = []
108
96
  files.each_with_index do |file, index|
109
97
  new_filename = "#{File.basename(file, File.extname(file))}_filtered.fa"
110
- unless File.exist?(new_filename)
98
+ if !File.exist?(new_filename) or File.stat(new_filename).size < 1
111
99
  File.open(new_filename, "wb") do |out|
112
- puts "opening #{file}..."
100
+ puts "filtering #{file}..." if @verbose
113
101
  Bio::FastaFormat.open(file).each do |entry|
114
102
  contig_name = entry.entry_id
115
103
  contig_name = "contig#{index}_#{contig_name}"
116
- if scores.key?(contig_name) and scores[contig_name] > 0.01
104
+ if scores.key?(contig_name) and
105
+ scores[contig_name][:score] > 0.01 and
106
+ scores[contig_name][:coverage] >= 1
117
107
  out.write ">#{contig_name}\n"
118
108
  out.write "#{entry.seq}\n"
119
109
  elsif !scores.key?(contig_name)
@@ -127,75 +117,105 @@ module Transfuse
127
117
  return filtered_files
128
118
  end
129
119
 
120
+ def transrate_consensus file, output, left, right
121
+ output = File.expand_path(output)
122
+ puts "transrate on #{file}" if @verbose
123
+ file = File.expand_path(file)
124
+ name = File.basename(file, File.extname(file))
125
+ dir = "transrate_#{name}"
126
+ Dir.mkdir(dir) unless Dir.exist?(dir)
127
+ Dir.chdir(dir) do
128
+ assembly = Transrate::Assembly.new(file)
129
+ transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
130
+ rename = "assembly_#{name}_score_optimisation.csv"
131
+ rm = transrater.read_metrics(left.join(','), right.join(','))
132
+ stats = rm.read_stats
133
+ File.rename("assembly_score_optimisation.csv", rename)
134
+ scores={}
135
+ assembly.each do |name, contig|
136
+ scores[name] = { :score => contig.score.to_f,
137
+ :p_good => contig.p_good.to_f,
138
+ :p_bases_covered => contig.p_bases_covered.to_f,
139
+ :coverage => contig.coverage.to_f }
140
+ end
141
+ scores_file = "#{name}_scores.csv"
142
+ stats_file = "../#{name}_stats.txt"
143
+ puts " writing scores" if @verbose
144
+ File.open(scores_file, "wb") do |out|
145
+ scores.each do |name, hash|
146
+ out.write "#{name}\t#{hash[:score]}\t#{hash[:p_good]}\t"
147
+ out.write "#{hash[:p_bases_covered]}\t#{hash[:coverage]}\n"
148
+ end
149
+ end
150
+ puts " writing filtered fasta file" if @verbose
151
+ File.open(output, "wb") do |out|
152
+ assembly.each do |name, contig|
153
+ if contig.score.to_f > 0.01 and contig.coverage.to_f >= 1
154
+ out.write ">#{name}\n"
155
+ out.write "#{contig.seq.seq}\n"
156
+ end
157
+ end
158
+ end
159
+ puts " writing stats" if @verbose
160
+ File.open(stats_file, "wb") do |out|
161
+ stats.each do |key, value|
162
+ out.write "#{key}\t#{value}\n"
163
+ end
164
+ out.write "assembly score:\t#{transrater.assembly_score}\n"
165
+ optimal = transrater.assembly_optimal_score("prefix")
166
+ out.write "optimal score :\t#{optimal[0]}\n"
167
+ out.write "cutoff :\t#{optimal[1]}\n"
168
+ end
169
+ end
170
+ end
130
171
 
131
172
  def transrate files, left, right
132
173
  scores = {}
133
- scores_file = "scores.csv"
174
+ shortname = ""
175
+ files.each do |n|
176
+ shortname << File.basename(n, File.extname(n))[0..4]
177
+ end
178
+ scores_file = "#{shortname}_scores.csv"
134
179
  if File.exist?(scores_file)
135
180
  puts "loading scores from file" if @verbose
136
181
  File.open(scores_file).each do |line|
137
- name, score = line.chomp.split("\t")
138
- scores[name] = score.to_f
182
+ name, score, p_good, p_bases_covered, coverage = line.chomp.split("\t")
183
+ scores[name] = { :score => score.to_f,
184
+ :p_good => p_good.to_f,
185
+ :p_bases_covered => p_bases_covered.to_f,
186
+ :coverage => coverage.to_f }
139
187
  end
140
188
  else
141
189
  files.each_with_index do |fasta, index|
142
190
  puts "transrate on #{fasta}" if @verbose
143
- assembly = Transrate::Assembly.new(fasta)
144
- transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
145
- transrater.read_metrics(left.join(','), right.join(','))
146
- assembly.each do |name, contig|
147
- name = "contig#{index}_#{name}"
148
- scores[name] = contig.score
191
+ dir = "transrate_#{File.basename(fasta, File.extname(fasta))}"
192
+ Dir.mkdir(dir) unless Dir.exist?(dir)
193
+ Dir.chdir(dir) do
194
+ assembly = Transrate::Assembly.new(fasta)
195
+ transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
196
+ rename = "assembly#{index}_score_optimisation.csv"
197
+ transrater.read_metrics(left.join(','), right.join(','))
198
+ File.rename("assembly_score_optimisation.csv", rename)
199
+ assembly.each do |name, contig|
200
+ name = "contig#{index}_#{name}"
201
+ scores[name] = { :score => contig.score.to_f,
202
+ :p_good => contig.p_good.to_f,
203
+ :p_bases_covered => contig.p_bases_covered.to_f,
204
+ :coverage => contig.coverage.to_f }
205
+
206
+ end
149
207
  end
150
208
  end
151
209
  File.open(scores_file, "wb") do |out|
152
- scores.each do |name, score|
153
- out.write "#{name}\t#{score}\n"
210
+ scores.each do |name, hash|
211
+ out.write "#{name}\t#{hash[:score]}\t#{hash[:p_good]}\t"
212
+ out.write "#{hash[:p_bases_covered]}\t#{hash[:coverage]}\n"
154
213
  end
155
214
  end
156
215
  end
157
216
  return scores
158
217
  end
159
218
 
160
- def select_contigs clusters, scores
161
- puts "selecting contigs" if @verbose
162
- best = []
163
- clusters.each do |cluster_id, list|
164
- best_score = 0
165
- best_contig = ""
166
- list.each do |contig_name|
167
- unless scores[contig_name]
168
- abort "can't find #{contig_name} in scores hash\n"
169
- end
170
- if scores[contig_name] > best_score
171
- best_score = scores[contig_name]
172
- best_contig = contig_name
173
- end
174
- end
175
- best << best_contig
176
- end
177
- return best
178
- end
179
-
180
- def output_contigs best, fasta, output
181
- puts "writing contigs" if @verbose
182
- # read in catted fasta sequences
183
- sequences = {}
184
- Bio::FastaFormat.open(fasta).each do |entry|
185
- sequences[entry.entry_id] = entry.seq
186
- end
187
- File.open(output, "wb") do |out|
188
- best.each do |contig_name|
189
- if sequences.key?(contig_name)
190
- out.write ">#{contig_name}\n"
191
- out.write "#{sequences[contig_name]}\n"
192
- else
193
- puts "can't find #{contig_name} in #{fasta}"
194
- end
195
- end
196
- end
197
- end
198
-
199
219
  end
200
220
 
201
221
  end
@@ -7,8 +7,8 @@ module Transfuse
7
7
  # Semantic Versioning 2.0 (http://semver.org/).
8
8
  module VERSION
9
9
  MAJOR = 0
10
- MINOR = 1
11
- PATCH = 4
10
+ MINOR = 4
11
+ PATCH = 2
12
12
  BUILD = nil
13
13
 
14
14
  STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
data/lib/transfuse.rb CHANGED
@@ -1,4 +1,5 @@
1
- require 'transfuse/cluster.rb'
2
- require 'transfuse/cmd.rb'
3
- require 'transfuse/transfuse.rb'
4
- require 'transfuse/version.rb'
1
+ require 'transfuse/cluster'
2
+ require 'transfuse/cmd'
3
+ require 'transfuse/consensus'
4
+ require 'transfuse/transfuse'
5
+ require 'transfuse/version'
@@ -8,13 +8,13 @@ class TestTransfuse < Test::Unit::TestCase
8
8
  context 'transfuse' do
9
9
 
10
10
  setup do
11
- @fuser = Transfuse::Transfuse.new 4
11
+ @fuser = Transfuse::Transfuse.new 4, true
12
12
  end
13
13
 
14
14
  teardown do
15
15
  end
16
16
 
17
- should 'check for existence of files' do
17
+ should '1 check for existence of files' do
18
18
  list = []
19
19
  list << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
20
20
  list << File.join(File.dirname(__FILE__), 'data', 'assembly2.fasta')
@@ -22,7 +22,7 @@ class TestTransfuse < Test::Unit::TestCase
22
22
  assert_equal 2, files.length, "length"
23
23
  end
24
24
 
25
- should "concatenate two files" do
25
+ should "2 concatenate two files" do
26
26
  list = []
27
27
  list << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
28
28
  list << File.join(File.dirname(__FILE__), 'data', 'assembly2.fasta')
@@ -36,71 +36,69 @@ class TestTransfuse < Test::Unit::TestCase
36
36
  end
37
37
  end
38
38
 
39
- should "cluster fasta file" do
40
- Dir.mktmpdir do |tmpdir|
39
+ should "3 cluster fasta file" do
40
+ # Dir.mktmpdir do |tmpdir|
41
+ tmpdir = Dir.mktmpdir
41
42
  Dir.chdir(tmpdir) do
42
43
  file = File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
43
44
  hash = @fuser.cluster file
44
45
  assert_equal 250, hash.size, "output size"
45
46
  end
46
- end
47
+ # end
47
48
  end
48
49
 
49
- should "load scores from transrate output" do
50
+ should "4 load scores from transrate output" do
50
51
  files = []
51
52
  files << File.join(File.dirname(__FILE__), 'data', 'contig_scores1.csv')
52
53
  hash = @fuser.load_scores files
53
54
  assert_equal 99, hash.size
54
55
  end
55
56
 
56
- should "filter contigs" do
57
+ should "5 run transrate on assembly files with reads" do
57
58
  files = []
58
- scores = {}
59
- files << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
60
- scores["soap_contig173359"] = 1
61
- scores["soap_contig38533"] = 0.5
62
- scores["idba_contig44716"] = 0
63
- new_list = @fuser.filter files, scores
64
- assert_equal 1, new_list.length
65
- cmd = "grep -c \">\" #{new_list.first}"
66
- assert_equal 2, `#{cmd}`.chomp.split.first.to_i, "contigs"
59
+ left = []
60
+ right = []
61
+ files << File.join(File.dirname(__FILE__), 'data', 'assembly3.fasta')
62
+ left << File.join(File.dirname(__FILE__), 'data', 'left.fq')
63
+ right << File.join(File.dirname(__FILE__), 'data', 'right.fq')
64
+ # Dir.mktmpdir do |tmpdir|
65
+ tmpdir = Dir.mktmpdir
66
+ Dir.chdir(tmpdir) do
67
+ scores = @fuser.transrate files, left, right
68
+ assert_equal 100, scores.size, "scores size"
69
+ end
70
+ # end
67
71
  end
68
72
 
69
- should "run transrate on assembly files with reads" do
73
+ should "6 filter contigs" do
70
74
  files = []
71
75
  left = []
72
76
  right = []
73
- files << File.join(File.dirname(__FILE__), 'data', 'assembly3.fasta')
77
+ files << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
74
78
  left << File.join(File.dirname(__FILE__), 'data', 'left.fq')
75
79
  right << File.join(File.dirname(__FILE__), 'data', 'right.fq')
76
- Dir.mktmpdir do |tmpdir|
80
+ # Dir.mktmpdir do |tmpdir|
81
+ tmpdir = Dir.mktmpdir
77
82
  Dir.chdir(tmpdir) do
78
83
  scores = @fuser.transrate files, left, right
79
- assert_equal 100, scores.size, "scores size"
84
+ scores.each do |contig, score|
85
+ # puts "#{contig}\t#{score}"
86
+ end
87
+ new_list = @fuser.filter files, scores
88
+ assert_equal 1, new_list.length
89
+ cmd = "grep -c \">\" #{new_list.first}"
90
+ assert_equal 1, `#{cmd}`.chomp.split.first.to_i, "number of contigs"
80
91
  end
81
- end
92
+ # end
93
+
82
94
  end
83
95
 
84
- should "select contigs" do
85
- clusters = {"0" => ["contig1", "contig2"], "1" => ["contig3", "contig4"]}
86
- scores = { "contig1" => 0.2,
87
- "contig2" => 0.3,
88
- "contig3" => 0.4,
89
- "contig4" => 0.2 }
90
- best = @fuser.select_contigs clusters, scores
91
- assert_equal 2, best.size
92
- assert_equal "contig2", best[0]
93
- assert_equal "contig3", best[1]
96
+ should "7 get consensus of clusters" do
97
+
94
98
  end
95
99
 
96
- should "output contigs" do
97
- best = ["soap_contig173359", "oases_contig80246"]
98
- file = File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
99
- Dir.mktmpdir do |tmpdir|
100
- Dir.chdir(tmpdir) do
101
- @fuser.output_contigs best, file, "out"
102
- end
103
- end
100
+ should "8 not fail when there are duplicated kmers in the input sequences" do
101
+
104
102
  end
105
103
 
106
104
  end
data/transfuse.gemspec CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |gem|
19
19
  gem.add_dependency 'bio', '~> 1.4', '>= 1.4.3'
20
20
  gem.add_dependency 'fixwhich', '~> 1.0', '>= 1.0.2'
21
21
  gem.add_dependency 'bindeps', '~> 1.0', '>= 1.0.1'
22
- gem.add_dependency 'transrate', '~> 1.0', '>= 1.0.0'
22
+ gem.add_dependency 'transrate', '~> 1.0', '>= 1.0.1'
23
23
 
24
24
  gem.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
25
25
  gem.add_development_dependency 'turn', '~> 0.9', '>= 0.9.7'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: transfuse
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Richard Smith-Unna
@@ -94,7 +94,7 @@ dependencies:
94
94
  version: '1.0'
95
95
  - - ">="
96
96
  - !ruby/object:Gem::Version
97
- version: 1.0.0
97
+ version: 1.0.1
98
98
  type: :runtime
99
99
  prerelease: false
100
100
  version_requirements: !ruby/object:Gem::Requirement
@@ -104,7 +104,7 @@ dependencies:
104
104
  version: '1.0'
105
105
  - - ">="
106
106
  - !ruby/object:Gem::Version
107
- version: 1.0.0
107
+ version: 1.0.1
108
108
  - !ruby/object:Gem::Dependency
109
109
  name: rake
110
110
  requirement: !ruby/object:Gem::Requirement
@@ -210,7 +210,6 @@ extra_rdoc_files: []
210
210
  files:
211
211
  - ".gitignore"
212
212
  - Gemfile
213
- - Gemfile.lock
214
213
  - README.md
215
214
  - Rakefile
216
215
  - bin/transfuse
@@ -218,6 +217,7 @@ files:
218
217
  - lib/transfuse.rb
219
218
  - lib/transfuse/cluster.rb
220
219
  - lib/transfuse/cmd.rb
220
+ - lib/transfuse/consensus.rb
221
221
  - lib/transfuse/transfuse.rb
222
222
  - lib/transfuse/version.rb
223
223
  - notes.md
@@ -248,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
248
248
  version: '0'
249
249
  requirements: []
250
250
  rubyforge_project:
251
- rubygems_version: 2.4.6
251
+ rubygems_version: 2.2.2
252
252
  signing_key:
253
253
  specification_version: 4
254
254
  summary: Merge assemblies
data/Gemfile.lock DELETED
@@ -1,87 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- transfuse (0.1.1)
5
- bindeps (~> 1.0, >= 1.0.1)
6
- bio (~> 1.4, >= 1.4.3)
7
- fixwhich (~> 1.0, >= 1.0.2)
8
- transrate (= 1.0.0.beta3)
9
- trollop (~> 2.0)
10
-
11
- GEM
12
- remote: https://rubygems.org/
13
- specs:
14
- ansi (1.5.0)
15
- bindeps (1.1.2)
16
- fixwhich (~> 1.0, >= 1.0.2)
17
- bio (1.4.3.0001)
18
- coveralls (0.8.1)
19
- json (~> 1.8)
20
- rest-client (>= 1.6.8, < 2)
21
- simplecov (~> 0.10.0)
22
- term-ansicolor (~> 1.3)
23
- thor (~> 0.19.1)
24
- crb-blast (0.6.4)
25
- bindeps (~> 1.0, >= 1.0.3)
26
- bio (~> 1.4, >= 1.4.3)
27
- fixwhich (~> 1.0, >= 1.0.2)
28
- threach (~> 0.2, >= 0.2.0)
29
- trollop (~> 2.0)
30
- docile (1.1.5)
31
- domain_name (0.5.24)
32
- unf (>= 0.0.5, < 1.0.0)
33
- facade (1.0.6)
34
- fix-trinity-output (1.0.0)
35
- trollop (~> 2.0)
36
- fixwhich (1.0.2)
37
- pathname2 (~> 1.4, >= 1.4.4)
38
- http-cookie (1.0.2)
39
- domain_name (~> 0.5)
40
- json (1.8.3)
41
- mime-types (2.6.1)
42
- minitest (4.7.5)
43
- netrc (0.10.3)
44
- pathname2 (1.7.3)
45
- facade
46
- rake (10.4.2)
47
- rest-client (1.8.0)
48
- http-cookie (>= 1.0.2, < 2.0)
49
- mime-types (>= 1.16, < 3.0)
50
- netrc (~> 0.7)
51
- shoulda-context (1.2.1)
52
- simplecov (0.10.0)
53
- docile (~> 1.1.0)
54
- json (~> 1.8)
55
- simplecov-html (~> 0.10.0)
56
- simplecov-html (0.10.0)
57
- term-ansicolor (1.3.0)
58
- tins (~> 1.0)
59
- thor (0.19.1)
60
- threach (0.2.0)
61
- tins (1.5.2)
62
- transrate (1.0.0.beta3)
63
- bindeps (~> 1.1, >= 1.1.2)
64
- bio (~> 1.4, >= 1.4.3)
65
- crb-blast (~> 0.5, >= 0.5.0)
66
- fix-trinity-output (~> 1.0, >= 1.0)
67
- trollop (~> 2.0, >= 2.0.0)
68
- yell (~> 2.0, >= 2.0.4)
69
- trollop (2.1.1)
70
- turn (0.9.7)
71
- ansi
72
- minitest (~> 4)
73
- unf (0.1.4)
74
- unf_ext
75
- unf_ext (0.0.7.1)
76
- yell (2.0.5)
77
-
78
- PLATFORMS
79
- ruby
80
-
81
- DEPENDENCIES
82
- coveralls (~> 0.7)
83
- rake (~> 10.3, >= 10.3.2)
84
- shoulda-context (~> 1.2, >= 1.2.1)
85
- simplecov (~> 0.8, >= 0.8.2)
86
- transfuse!
87
- turn (~> 0.9, >= 0.9.7)