transfuse 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9301f6cf6f1e24b506789da98e79c1b87cc4cfb7
4
+ data.tar.gz: 066ee9226775d9492a28291d5ee6b16fe79e61e3
5
+ SHA512:
6
+ metadata.gz: 48eb4011a41c92936752b8e7ec1aa53ab12a2737d62b6c0b733f09edf35aebe17e63562513a8a3a60ce8a0cef17201150d8c8432d649f2f8b498eb2f1b9822d3
7
+ data.tar.gz: 2339858d5e0e19184cbf9d259506ab7efd5d786fcb5ca95ffa62db13c481efc08d24fea58b47a6637391a1b7a45740074a76961609c2f3164761621657785d25
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ # directories
2
+
3
+ coverage
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,87 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ transfuse (0.1.1)
5
+ bindeps (~> 1.0, >= 1.0.1)
6
+ bio (~> 1.4, >= 1.4.3)
7
+ fixwhich (~> 1.0, >= 1.0.2)
8
+ transrate (= 1.0.0.beta3)
9
+ trollop (~> 2.0)
10
+
11
+ GEM
12
+ remote: https://rubygems.org/
13
+ specs:
14
+ ansi (1.5.0)
15
+ bindeps (1.1.2)
16
+ fixwhich (~> 1.0, >= 1.0.2)
17
+ bio (1.4.3.0001)
18
+ coveralls (0.8.1)
19
+ json (~> 1.8)
20
+ rest-client (>= 1.6.8, < 2)
21
+ simplecov (~> 0.10.0)
22
+ term-ansicolor (~> 1.3)
23
+ thor (~> 0.19.1)
24
+ crb-blast (0.6.4)
25
+ bindeps (~> 1.0, >= 1.0.3)
26
+ bio (~> 1.4, >= 1.4.3)
27
+ fixwhich (~> 1.0, >= 1.0.2)
28
+ threach (~> 0.2, >= 0.2.0)
29
+ trollop (~> 2.0)
30
+ docile (1.1.5)
31
+ domain_name (0.5.24)
32
+ unf (>= 0.0.5, < 1.0.0)
33
+ facade (1.0.6)
34
+ fix-trinity-output (1.0.0)
35
+ trollop (~> 2.0)
36
+ fixwhich (1.0.2)
37
+ pathname2 (~> 1.4, >= 1.4.4)
38
+ http-cookie (1.0.2)
39
+ domain_name (~> 0.5)
40
+ json (1.8.3)
41
+ mime-types (2.6.1)
42
+ minitest (4.7.5)
43
+ netrc (0.10.3)
44
+ pathname2 (1.7.3)
45
+ facade
46
+ rake (10.4.2)
47
+ rest-client (1.8.0)
48
+ http-cookie (>= 1.0.2, < 2.0)
49
+ mime-types (>= 1.16, < 3.0)
50
+ netrc (~> 0.7)
51
+ shoulda-context (1.2.1)
52
+ simplecov (0.10.0)
53
+ docile (~> 1.1.0)
54
+ json (~> 1.8)
55
+ simplecov-html (~> 0.10.0)
56
+ simplecov-html (0.10.0)
57
+ term-ansicolor (1.3.0)
58
+ tins (~> 1.0)
59
+ thor (0.19.1)
60
+ threach (0.2.0)
61
+ tins (1.5.2)
62
+ transrate (1.0.0.beta3)
63
+ bindeps (~> 1.1, >= 1.1.2)
64
+ bio (~> 1.4, >= 1.4.3)
65
+ crb-blast (~> 0.5, >= 0.5.0)
66
+ fix-trinity-output (~> 1.0, >= 1.0)
67
+ trollop (~> 2.0, >= 2.0.0)
68
+ yell (~> 2.0, >= 2.0.4)
69
+ trollop (2.1.1)
70
+ turn (0.9.7)
71
+ ansi
72
+ minitest (~> 4)
73
+ unf (0.1.4)
74
+ unf_ext
75
+ unf_ext (0.0.7.1)
76
+ yell (2.0.5)
77
+
78
+ PLATFORMS
79
+ ruby
80
+
81
+ DEPENDENCIES
82
+ coveralls (~> 0.7)
83
+ rake (~> 10.3, >= 10.3.2)
84
+ shoulda-context (~> 1.2, >= 1.2.1)
85
+ simplecov (~> 0.8, >= 0.8.2)
86
+ transfuse!
87
+ turn (~> 0.9, >= 0.9.7)
data/README.md ADDED
@@ -0,0 +1 @@
1
+ Transfuse
data/Rakefile ADDED
@@ -0,0 +1,20 @@
1
+ require 'rake/testtask'
2
+
3
# Default test task (named :test by Rake::TestTask); only adds test/ to the load path.
Rake::TestTask.new { |t| t.libs << 'test' }

# Focused test tasks, each running a single test file.
Rake::TestTask.new(:corset) do |t|
  t.libs << 'test'
  t.test_files = ['test/test_corset.rb']
end

Rake::TestTask.new(:cluster) do |t|
  t.libs << 'test'
  t.test_files = ['test/test_cluster.rb']
end

desc "Run tests"
task :default => :test
data/bin/transfuse ADDED
@@ -0,0 +1,73 @@
1
#!/usr/bin/env ruby

# Command-line driver for transfuse. It parses the options, obtains a score
# for every contig (loaded from score files, or computed with transrate from
# the reads), filters and concatenates the assemblies, clusters the
# concatenated file and aligns the clusters.

require 'trollop'
require 'transfuse'

# show the help text when the program is called without any arguments
ARGV[0] = "--help" if ARGV.empty?

opts = Trollop::options do
  version Transfuse::VERSION::STRING.dup
  banner <<-EOS

  Transfuse v#{Transfuse::VERSION::STRING.dup}
  by Chris Boursnell <cmb211@cam.ac.uk> and
     Richard Smith-Unna <rds45@cam.ac.uk>

  DESCRIPTION:
  Merge multiple assemblies.

  USAGE:
  transfuse <options>

  OPTIONS:

  EOS
  opt :assembly, "assembly files in FASTA format, comma-separated",
      :type => String, :required => true
  opt :scores, "transrate contig score output files, comma-separated",
      :type => String
  opt :left, "left reads file in FASTQ format",
      :type => String
  opt :right, "right reads file in FASTQ format",
      :type => String
  opt :output, "write merged assembly to file",
      :type => String, :required => true
  opt :threads, "number of threads", :type => :int, :default => 1
  opt :verbose, "be verbose"
end

transfuse = Transfuse::Transfuse.new opts.threads, opts.verbose

assembly_files = transfuse.check_files opts.assembly
# the option is declared as :scores; reading opts.score never yields the
# value, which left score_files nil and broke load_scores below
score_files = transfuse.check_files opts.scores if opts.scores
left = transfuse.check_files opts.left if opts.left
right = transfuse.check_files opts.right if opts.right

if opts.scores
  # load the scores from the comma separated list of files
  scores = transfuse.load_scores score_files
elsif opts.left && opts.right
  scores = transfuse.transrate assembly_files, left, right
else
  msg = "Please provide either transrate contig scores as csv files or\n"
  msg << "left and right fastq files to generate scores using transrate"
  abort msg
end

assembly_files = transfuse.filter assembly_files, scores

# concatenate assemblies into one fasta file
cat = transfuse.concatenate assembly_files

# load fasta sequences from concatenated file into hash
transfuse.load_fasta cat

# cluster using vsearch or maybe cd-hit-est
clusters = transfuse.cluster cat

transfuse.sequence_alignment clusters
# pull out contigs from each cluster based on the scores
# best = transfuse.select_contigs clusters, scores

# transfuse.output_contigs best, cat, opts.output
data/deps/deps.yaml ADDED
File without changes
@@ -0,0 +1,113 @@
1
+ module Transfuse
2
+
3
+ require 'bio'
4
+ require 'fixwhich'
5
+
6
+ class Cluster
7
+
8
# Stores the run settings and looks up the external clustering programs.
#
# threads - value later passed to the tools' thread options
# verbose - when truthy, progress messages are printed
#
# Raises RuntimeError when cd-hit-est or vsearch is not found on the PATH
# (cd-hit-est is looked up first).
def initialize(threads, verbose)
  @cdhit = Which.which('cd-hit-est').first
  unless @cdhit
    raise "cd-hit-est was not in the PATH - please install it"
  end

  @vsearch = Which.which('vsearch').first
  unless @vsearch
    raise "vsearch was not in the PATH - please install it"
  end

  # identity threshold handed to the clustering tools
  @id = "1.00"
  @threads = threads
  @verbose = verbose
end
17
+
18
# Clusters the sequences in +fasta+ and returns the parsed clusters.
#
# The cd-hit-est branch is switched off by the hard-coded flag below, so the
# vsearch branch is the one that runs.
#
# fasta - path of the FASTA file to cluster
#
# Returns the value built by parse_vsearch_output (or by parse_output if the
# flag is flipped).
def run(fasta)
  use_cd_hit = false
  return parse_output(cd_hit(fasta)) if use_cd_hit

  parse_vsearch_output(vsearch(fasta))
end
28
+
29
# Runs cd-hit-est on +fasta+.
#
# The clustered FASTA is written to "<basename of fasta>_cdhit.fa" in the
# current directory. Cmd#run is given that name, so the command is skipped
# when the file already exists.
#
# fasta - path of the FASTA file to cluster
#
# Returns the name of the cluster listing ("<output>.clstr").
# Raises RuntimeError when the command reports failure, instead of letting
# the caller trip over a missing or incomplete listing later.
def cd_hit(fasta)
  puts "running cd-hit-est" if @verbose
  output = "#{File.basename(fasta, File.extname(fasta))}_cdhit.fa"
  cdhit_cmd = generate_cdhit_command(fasta, output)
  puts cdhit_cmd if @verbose
  cluster = Cmd.new(cdhit_cmd)
  cluster.run(output)
  unless cluster.status.success?
    raise "cd-hit-est failed on #{fasta}\n#{cluster.stderr}"
  end
  "#{output}.clstr"
end
38
+
39
# Runs vsearch on +fasta+.
#
# The cluster table is written to "<fasta>.clust". Cmd#run is given that
# name, so the command is skipped when the file already exists.
#
# fasta - path of the FASTA file to cluster
#
# Returns the path of the cluster table.
# Raises RuntimeError when the command reports failure, instead of letting
# the caller trip over a missing or incomplete table later.
def vsearch(fasta)
  puts "running vsearch" if @verbose
  cluster_output = "#{fasta}.clust"
  cluster = Cmd.new(generate_vsearch_command(fasta, cluster_output))
  cluster.run(cluster_output)
  unless cluster.status.success?
    raise "vsearch failed on #{fasta}\n#{cluster.stderr}"
  end
  cluster_output
end
47
+
48
# Builds the cd-hit-est command line.
#
# fasta - input FASTA path (-i)
# out   - output path (-o)
#
# Returns the command as a single String.
def generate_cdhit_command(fasta, out)
  #cd-hit-est -i all.fa -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
  parts = [
    "#{@cdhit}",
    "-i #{fasta}",
    "-o #{out}",
    "-c #{@id}",       # similarity = number of identical bases /
                       # length of shorter sequences
    "-T #{@threads}",
    "-n 10",           # word length - maybe increase??
    "-d 100",          # output name width
    "-g 1",            # slower but more accurate mode
    "-M 8000"          # increase memory
  ]
  parts.join(" ")
end
61
+
62
# Builds the vsearch command line.
#
# fasta - input FASTA path (--cluster_fast)
# out   - path of the cluster table to write (--uc)
#
# Returns the command as a single String.
def generate_vsearch_command(fasta, out)
  options = [
    "--cluster_fast #{fasta}",
    "--id #{@id}",
    "--iddef 0",        # cd-hit definition of sequence id
    "--qmask none",     # no masking
    "--strand both",
    "--uc #{out}",
    "--threads #{@threads}"
  ]
  "#{@vsearch} #{options.join(' ')}"
end
73
+
74
# Parses a cd-hit-est cluster listing.
#
# Three kinds of line are recognised:
#   ">Cluster N"                       starts cluster N
#   "... >NAME... at +/99.5%" (or -/)  member NAME with the given strand
#   "... >NAME... *"                   member NAME, recorded with strand "+"
# Any other line is ignored.
#
# cluster_output - path of the listing to read
#
# Returns a Hash mapping cluster id (Integer) to an Array of
# { :name => String, :strand => "+" or "-" } hashes.
def parse_output(cluster_output)
  puts "parsing cd-hit output #{cluster_output}" if @verbose
  cluster_id = 0
  clusters = {}
  # File.foreach closes the file when it is done; a bare File.open(...).each_line
  # left the handle open until garbage collection
  File.foreach(cluster_output) do |line|
    if line =~ />Cluster\ ([0-9]+)/
      cluster_id = Regexp.last_match(1).to_i
    elsif line =~ /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
      member = { :name => Regexp.last_match(1), :strand => Regexp.last_match(2) }
      (clusters[cluster_id] ||= []) << member
    elsif line =~ /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
      member = { :name => Regexp.last_match(1), :strand => "+" }
      (clusters[cluster_id] ||= []) << member
    end
  end
  clusters
end
96
+
97
# Parses the tab-separated cluster table written by vsearch (--uc).
#
# Only lines starting with "S" or "H" are used. For those, the second
# column is taken as the cluster number and the ninth column as the contig
# name; every other line is ignored.
#
# cluster_output - path of the table to read
#
# Returns a Hash mapping cluster number (Integer) to an Array of contig
# names (Strings).
#
# NOTE(review): members are bare names here, whereas parse_output yields
# { :name, :strand } hashes and Transfuse#sequence_alignment reads
# member[:name] / member[:strand] - confirm which shape callers should get.
def parse_vsearch_output(cluster_output)
  clusters = {}
  # File.foreach closes the file when it is done; a bare File.open(...).each_line
  # left the handle open until garbage collection
  File.foreach(cluster_output) do |line|
    next unless line.start_with?("S", "H")

    cols = line.chomp.split("\t")
    (clusters[cols[1].to_i] ||= []) << cols[8]
  end
  clusters
end
110
+
111
+ end
112
+
113
+ end
@@ -0,0 +1,38 @@
1
+ require 'open3'
2
+
3
module Transfuse

  # Placeholder status object used when a command is skipped; it always
  # reports success.
  class Status
    def success?
      true
    end
  end

  # Wraps a shell command string and captures its stdout, stderr and exit
  # status when run.
  class Cmd

    attr_accessor :cmd, :stdout, :stderr, :status

    # cmd - the command line to execute
    def initialize(cmd)
      @cmd = cmd
    end

    # Executes the command unless +file+ is given and already exists.
    #
    # file - optional path; when it exists the command is not run, stdout and
    #        stderr are set to empty strings and status to a Status
    #
    # Returns true when the command was skipped, false when it was executed.
    def run(file = nil)
      if !file.nil? && File.exist?(file)
        @stdout = ""
        @stderr = ""
        @status = Status.new
        return true
      end

      @stdout, @stderr, @status = Open3.capture3(@cmd)
      false
    end

    # Returns the command line.
    def to_s
      @cmd
    end

  end

end
@@ -0,0 +1,201 @@
1
# Extension of String with a nucleotide helper, used when a minus-strand
# cluster member has to be turned around before alignment.
class String
  # Returns the reverse complement of a nucleotide sequence.
  #
  # A, C, G and T are complemented in both upper and lower case (case is
  # preserved); lower-case bases were previously reversed without being
  # complemented. Any other character is left as it is and only its
  # position is reversed.
  def revcomp
    tr("ACGTacgt", "TGCAtgca").reverse
  end
end
6
+
7
+ module Transfuse
8
+
9
+ require 'csv'
10
+ require 'transrate'
11
+
12
+ class Transfuse
13
+
14
# Stores the run settings and looks up the clustalo executable.
#
# threads - thread count, later handed to Cluster and to transrate
# verbose - when truthy, progress messages are printed
#
# Raises RuntimeError when clustalo is not found on the PATH.
def initialize(threads, verbose)
  @threads, @verbose = threads, verbose
  @clustalo = Which.which('clustalo').first
  return if @clustalo

  raise "clustalo was not in the PATH - please install it"
end
20
+
21
# Expands a comma-separated list of file names and checks that each exists.
#
# string - comma-separated file names
#
# Returns an Array of absolute paths, in the order given.
# Aborts the program with "<path> not found" at the first missing file.
def check_files(string)
  string.split(",").map do |entry|
    path = File.expand_path(entry)
    abort "#{path} not found" unless File.exist?(path)
    puts "#{path} exists" if @verbose
    path
  end
end
34
+
35
# Concatenates the assemblies into one FASTA file in the current directory.
#
# The output is named "all-<p1>-<p2>....fa", where each <p> is the first six
# characters of an assembly's base name (extension removed). The work is
# done with `cat` through Cmd, which skips the command when the output file
# already exists.
#
# assemblies - Array of FASTA paths
#
# Returns the absolute path of the concatenated file.
def concatenate(assemblies)
  prefixes = assemblies.map do |path|
    File.basename(path, File.extname(path))[0..5]
  end
  catted_fasta = "all-#{prefixes.join('-')}.fa"
  puts "concatenating assemblies into #{catted_fasta}" if @verbose

  inputs = assemblies.map { |path| " #{path} " }.join
  catter = Cmd.new("cat #{inputs} > #{catted_fasta}")
  catter.run(catted_fasta)
  File.expand_path(catted_fasta)
end
53
+
54
# Clusters the sequences in +file+ with a new Cluster built from this
# object's thread and verbosity settings.
#
# file - path of the FASTA file to cluster
#
# Returns whatever Cluster#run returns.
def cluster(file)
  puts "clustering #{file}" if @verbose
  Cluster.new(@threads, @verbose).run(file)
end
59
+
60
# Reads every record of +fasta+ into @sequences, replacing anything loaded
# before. Keys are the records' entry ids, values the sequences as Strings.
#
# fasta - path of the FASTA file to read
def load_fasta(fasta)
  @sequences = {}
  reader = Bio::FastaFormat.open(fasta)
  reader.each { |record| @sequences[record.entry_id] = record.seq.to_s }
end
66
+
67
# Aligns the members of every cluster that has more than five members with
# clustalo and writes each alignment to "cluster<id>.fa" in the current
# directory. Smaller clusters are skipped.
#
# Sequences come from @sequences (filled by load_fasta); members on the "-"
# strand are reverse-complemented first.
#
# The sequences are handed to clustalo through a short-lived input file
# ("cluster<id>_input.fa") instead of `echo -e "<fasta>" | clustalo -i -`.
# `echo -e` is not portable between shells, the quoted text exposed sequence
# names to shell interpretation, and a large cluster could exceed the
# command-line length limit.
#
# clusters - Hash of cluster id => Array of { :name, :strand } hashes
#
# NOTE(review): Cluster#parse_vsearch_output returns bare name Strings, not
# hashes - confirm the shape passed in here.
#
# Aborts with "Unknown strand ..." when a strand is neither "+" nor "-".
def sequence_alignment(clusters)
  clusters.each do |id, list| # threach
    next unless list.size > 5

    fasta = ""
    list.each do |member|
      fasta << ">#{member[:name]}\n"
      case member[:strand]
      when "+"
        fasta << "#{@sequences[member[:name]]}\n"
      when "-"
        fasta << "#{@sequences[member[:name]].revcomp}\n"
      else
        abort "Unknown strand #{member[:strand]}"
      end
    end

    input = "cluster#{id}_input.fa"
    File.open(input, "wb") { |io| io.write fasta }
    begin
      cmd = "#{@clustalo} -i #{input} --outfmt fa "
      cmd << "--output-order tree-order"
      align = Cmd.new cmd
      align.run
    ensure
      # the input file is only needed while clustalo runs
      File.delete(input) if File.exist?(input)
    end

    File.open("cluster#{id}.fa", "wb") do |out|
      out.write align.stdout
    end
  end
end
91
+
92
# Loads contig scores from CSV files with a header row; the contig_name and
# score columns are read.
#
# Each name is stored as "contig<index>_<name>", where <index> is the
# position of the file in +files+. This is the key format that #filter looks
# up and that #transrate produces; without the prefix every lookup in
# #filter failed. It also keeps equal contig names from different files
# apart. The score files must therefore be listed in the same order as the
# assemblies they belong to.
#
# files - Array of CSV paths
#
# Returns a Hash mapping the prefixed contig name to its score.
def load_scores(files)
  scores = {}
  files.each_with_index do |file, index|
    CSV.foreach(file, :headers => true,
                      :header_converters => :symbol,
                      :converters => :all) do |row|
      scores["contig#{index}_#{row[:contig_name]}"] = row[:score]
    end
  end
  scores
end
105
+
106
# Writes a filtered copy of every assembly that keeps only the contigs
# scoring above +cutoff+, renamed to "contig<index>_<original id>" where
# <index> is the assembly's position in +files+.
#
# The copy is named "<basename>_filtered.fa" in the current directory and is
# reused when it already exists. It is first written under a ".part" name
# and renamed once complete, so a run that aborts half-way no longer leaves
# a truncated file that later runs would silently reuse.
#
# files  - Array of assembly FASTA paths
# scores - Hash of "contig<index>_<id>" => score
# cutoff - contigs need a score strictly greater than this (default 0.01)
#
# Returns an Array with the absolute path of each filtered file.
# Aborts when a contig has no entry in +scores+.
def filter(files, scores, cutoff = 0.01)
  filtered_files = []
  files.each_with_index do |file, index|
    new_filename = "#{File.basename(file, File.extname(file))}_filtered.fa"
    unless File.exist?(new_filename)
      partial = "#{new_filename}.part"
      File.open(partial, "wb") do |out|
        puts "opening #{file}..." if @verbose
        Bio::FastaFormat.open(file).each do |entry|
          contig_name = "contig#{index}_#{entry.entry_id}"
          unless scores.key?(contig_name)
            abort "Can't find '#{contig_name}' in scores"
          end
          next unless scores[contig_name] > cutoff

          out.write ">#{contig_name}\n"
          out.write "#{entry.seq}\n"
        end
      end
      File.rename(partial, new_filename)
    end
    filtered_files << File.expand_path(new_filename)
  end
  filtered_files
end
129
+
130
+
131
# Returns a score for every contig of every assembly.
#
# When "scores.csv" exists in the current directory it is used as a cache:
# each line holds a name and a score separated by a tab. Otherwise transrate
# is run on each assembly with the given reads, the contig scores are
# collected under "contig<index>_<name>" (index = position of the assembly
# in +files+) and the cache file is written.
#
# files - Array of assembly FASTA paths
# left  - Array of left read files
# right - Array of right read files
#
# Returns a Hash mapping prefixed contig name to score.
def transrate(files, left, right)
  scores = {}
  scores_file = "scores.csv"
  if File.exist?(scores_file)
    puts "loading scores from file" if @verbose
    # File.foreach closes the file when it is done; a bare File.open(...).each
    # left the handle open until garbage collection
    File.foreach(scores_file) do |line|
      next if line.strip.empty? # a blank line would otherwise add a nil key

      name, score = line.chomp.split("\t")
      scores[name] = score.to_f
    end
  else
    files.each_with_index do |fasta, index|
      puts "transrate on #{fasta}" if @verbose
      assembly = Transrate::Assembly.new(fasta)
      transrater = Transrate::Transrater.new(assembly, nil, threads: @threads)
      transrater.read_metrics(left.join(','), right.join(','))
      assembly.each do |name, contig|
        scores["contig#{index}_#{name}"] = contig.score
      end
    end
    File.open(scores_file, "wb") do |out|
      scores.each do |name, score|
        out.write "#{name}\t#{score}\n"
      end
    end
  end
  scores
end
159
+
160
# Picks the highest-scoring contig of every cluster.
#
# On a tie the contig listed first wins. The search no longer starts from a
# best score of 0, so a cluster whose scores are all zero or negative
# contributes its best contig instead of an empty name. A cluster without
# members contributes nothing.
#
# clusters - Hash of cluster id => Array of contig names
# scores   - Hash of contig name => score
#
# Returns an Array of contig names, one per non-empty cluster.
# Aborts when a contig has no score.
def select_contigs(clusters, scores)
  puts "selecting contigs" if @verbose
  best = []
  clusters.each_value do |list|
    list.each do |contig_name|
      unless scores[contig_name]
        abort "can't find #{contig_name} in scores hash\n"
      end
    end
    winner = list.max_by { |contig_name| scores[contig_name] }
    best << winner unless winner.nil?
  end
  best
end
179
+
180
# Writes the chosen contigs to +output+ in FASTA format.
#
# The sequences are read from +fasta+ and indexed by entry id. Contigs are
# written in the order given in +best+; a name that is not present in
# +fasta+ is reported on standard output and skipped.
#
# best   - Array of contig names to write
# fasta  - path of the FASTA file holding the sequences
# output - path of the file to write
def output_contigs(best, fasta, output)
  puts "writing contigs" if @verbose
  # read in catted fasta sequences
  sequences = {}
  Bio::FastaFormat.open(fasta).each do |record|
    sequences[record.entry_id] = record.seq
  end

  File.open(output, "wb") do |out|
    best.each do |contig_name|
      unless sequences.key?(contig_name)
        puts "can't find #{contig_name} in #{fasta}"
        next
      end
      out.write ">#{contig_name}\n"
      out.write "#{sequences[contig_name]}\n"
    end
  end
end
198
+
199
+ end
200
+
201
+ end
@@ -0,0 +1,17 @@
1
module Transfuse

  # Version of this codebase.
  #
  # Used in help messages and when generating the Gem. Versions must be
  # incremented in accordance with Semantic Versioning 2.0
  # (http://semver.org/).
  module VERSION
    MAJOR = 0
    MINOR = 1
    PATCH = 1
    BUILD = nil

    # dotted version string; parts that are nil (BUILD) are left out
    STRING = [MAJOR, MINOR, PATCH, BUILD].reject(&:nil?).join('.')
  end

end # Transfuse
data/lib/transfuse.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'transfuse/cluster.rb'
2
+ require 'transfuse/cmd.rb'
3
+ require 'transfuse/transfuse.rb'
4
+ require 'transfuse/version.rb'
data/notes.md ADDED
@@ -0,0 +1,3 @@
1
+ ## Transfuse notes
2
+
3
+ Musings and mutterings about things we are doing in this project