transfuse 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9301f6cf6f1e24b506789da98e79c1b87cc4cfb7
4
+ data.tar.gz: 066ee9226775d9492a28291d5ee6b16fe79e61e3
5
+ SHA512:
6
+ metadata.gz: 48eb4011a41c92936752b8e7ec1aa53ab12a2737d62b6c0b733f09edf35aebe17e63562513a8a3a60ce8a0cef17201150d8c8432d649f2f8b498eb2f1b9822d3
7
+ data.tar.gz: 2339858d5e0e19184cbf9d259506ab7efd5d786fcb5ca95ffa62db13c481efc08d24fea58b47a6637391a1b7a45740074a76961609c2f3164761621657785d25
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ # directories
2
+
3
+ coverage
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,87 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ transfuse (0.1.1)
5
+ bindeps (~> 1.0, >= 1.0.1)
6
+ bio (~> 1.4, >= 1.4.3)
7
+ fixwhich (~> 1.0, >= 1.0.2)
8
+ transrate (= 1.0.0.beta3)
9
+ trollop (~> 2.0)
10
+
11
+ GEM
12
+ remote: https://rubygems.org/
13
+ specs:
14
+ ansi (1.5.0)
15
+ bindeps (1.1.2)
16
+ fixwhich (~> 1.0, >= 1.0.2)
17
+ bio (1.4.3.0001)
18
+ coveralls (0.8.1)
19
+ json (~> 1.8)
20
+ rest-client (>= 1.6.8, < 2)
21
+ simplecov (~> 0.10.0)
22
+ term-ansicolor (~> 1.3)
23
+ thor (~> 0.19.1)
24
+ crb-blast (0.6.4)
25
+ bindeps (~> 1.0, >= 1.0.3)
26
+ bio (~> 1.4, >= 1.4.3)
27
+ fixwhich (~> 1.0, >= 1.0.2)
28
+ threach (~> 0.2, >= 0.2.0)
29
+ trollop (~> 2.0)
30
+ docile (1.1.5)
31
+ domain_name (0.5.24)
32
+ unf (>= 0.0.5, < 1.0.0)
33
+ facade (1.0.6)
34
+ fix-trinity-output (1.0.0)
35
+ trollop (~> 2.0)
36
+ fixwhich (1.0.2)
37
+ pathname2 (~> 1.4, >= 1.4.4)
38
+ http-cookie (1.0.2)
39
+ domain_name (~> 0.5)
40
+ json (1.8.3)
41
+ mime-types (2.6.1)
42
+ minitest (4.7.5)
43
+ netrc (0.10.3)
44
+ pathname2 (1.7.3)
45
+ facade
46
+ rake (10.4.2)
47
+ rest-client (1.8.0)
48
+ http-cookie (>= 1.0.2, < 2.0)
49
+ mime-types (>= 1.16, < 3.0)
50
+ netrc (~> 0.7)
51
+ shoulda-context (1.2.1)
52
+ simplecov (0.10.0)
53
+ docile (~> 1.1.0)
54
+ json (~> 1.8)
55
+ simplecov-html (~> 0.10.0)
56
+ simplecov-html (0.10.0)
57
+ term-ansicolor (1.3.0)
58
+ tins (~> 1.0)
59
+ thor (0.19.1)
60
+ threach (0.2.0)
61
+ tins (1.5.2)
62
+ transrate (1.0.0.beta3)
63
+ bindeps (~> 1.1, >= 1.1.2)
64
+ bio (~> 1.4, >= 1.4.3)
65
+ crb-blast (~> 0.5, >= 0.5.0)
66
+ fix-trinity-output (~> 1.0, >= 1.0)
67
+ trollop (~> 2.0, >= 2.0.0)
68
+ yell (~> 2.0, >= 2.0.4)
69
+ trollop (2.1.1)
70
+ turn (0.9.7)
71
+ ansi
72
+ minitest (~> 4)
73
+ unf (0.1.4)
74
+ unf_ext
75
+ unf_ext (0.0.7.1)
76
+ yell (2.0.5)
77
+
78
+ PLATFORMS
79
+ ruby
80
+
81
+ DEPENDENCIES
82
+ coveralls (~> 0.7)
83
+ rake (~> 10.3, >= 10.3.2)
84
+ shoulda-context (~> 1.2, >= 1.2.1)
85
+ simplecov (~> 0.8, >= 0.8.2)
86
+ transfuse!
87
+ turn (~> 0.9, >= 0.9.7)
data/README.md ADDED
@@ -0,0 +1 @@
1
+ Transfuse
data/Rakefile ADDED
@@ -0,0 +1,20 @@
1
# Rake tasks for running the test suites.
require 'rake/testtask'

# Test task with the default name and file selection; only adds test/ to
# the load path.
Rake::TestTask.new { |t| t.libs << 'test' }

# One task per single-file suite: `rake corset` and `rake cluster`.
{
  :corset  => ['test/test_corset.rb'],
  :cluster => ['test/test_cluster.rb']
}.each do |task_name, files|
  Rake::TestTask.new do |t|
    t.name = task_name
    t.libs << 'test'
    t.test_files = files
  end
end

desc "Run tests"
task :default => :test
data/bin/transfuse ADDED
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'trollop'
4
+ require 'transfuse'
5
+
6
# Command-line driver: parse options, obtain a score for every contig,
# then filter, concatenate, cluster and align the input assemblies.

# With no arguments at all, show the help text.
ARGV << "--help" if ARGV.empty?

opts = Trollop::options do
  version Transfuse::VERSION::STRING.dup
  banner <<-EOS

  Transfuse v#{Transfuse::VERSION::STRING.dup}
  by Chris Boursnell <cmb211@cam.ac.uk> and
  Richard Smith-Unna <rds45@cam.ac.uk>

  DESCRIPTION:
  Merge multiple assemblies.

  USAGE:
  transfuse <options>

  OPTIONS:

  EOS
  opt :assembly, "assembly files in FASTA format, comma-separated",
      :type => String, :required => true
  opt :scores, "transrate contig score output files, comma-separated",
      :type => String
  opt :left, "left reads file in FASTQ format",
      :type => String
  opt :right, "right reads file in FASTQ format",
      :type => String
  # was `:required => :true` (a Symbol); the boolean is what is meant
  opt :output, "write merged assembly to file",
      :type => String, :required => true
  opt :threads, "number of threads", :type => :int, :default => 1
  opt :verbose, "be verbose"
end

transfuse = Transfuse::Transfuse.new opts.threads, opts.verbose

# Every file list given on the command line is validated up front;
# check_files aborts on the first path that does not exist.
assembly_files = transfuse.check_files opts.assembly
# The option is declared as :scores. Reading `opts.score` here left
# score_files unset, so load_scores below was handed nil.
score_files = transfuse.check_files opts.scores if opts.scores
left = transfuse.check_files opts.left if opts.left
right = transfuse.check_files opts.right if opts.right

if opts.scores
  # load the scores from the comma separated list of files
  scores = transfuse.load_scores score_files
elsif opts.left && opts.right
  # no precomputed scores: generate them from the reads with transrate
  scores = transfuse.transrate assembly_files, left, right
else
  msg = "Please provide either transrate contig scores as csv files or\n"
  msg << "left and right fastq files to generate scores using transrate"
  abort msg
end

assembly_files = transfuse.filter assembly_files, scores

# concatenate assemblies into one fasta file
cat = transfuse.concatenate assembly_files

# load fasta sequences from concatenated file into hash
transfuse.load_fasta cat

# cluster using vsearch or maybe cd-hit-est
clusters = transfuse.cluster cat

transfuse.sequence_alignment clusters
# pull out contigs from each cluster based on the scores
# best = transfuse.select_contigs clusters, scores

# transfuse.output_contigs best, cat, opts.output
73
+
data/deps/deps.yaml ADDED
File without changes
@@ -0,0 +1,113 @@
1
+ module Transfuse
2
+
3
+ require 'bio'
4
+ require 'fixwhich'
5
+
6
+ class Cluster
7
+
8
# Store the run settings and look up the external clustering tools.
#
# threads - value interpolated into the -T / --threads command options
# verbose - when truthy, progress messages are printed
#
# Raises RuntimeError when Which::which finds no cd-hit-est or no vsearch.
def initialize threads, verbose
  @threads = threads
  @verbose = verbose
  @id = "1.00" # value interpolated into -c (cd-hit-est) and --id (vsearch)

  @cdhit = Which::which('cd-hit-est').first
  raise "cd-hit-est was not in the PATH - please install it" unless @cdhit

  @vsearch = Which::which('vsearch').first
  raise "vsearch was not in the PATH - please install it" unless @vsearch
end
17
+
18
# Cluster the sequences in +fasta+ and return the parsed clusters.
#
# vsearch is the active back end. The cd-hit-est path is kept but is only
# reachable by flipping the local +use_cd_hit+ switch, which is hard-coded
# to false.
def run fasta
  use_cd_hit = false
  return parse_output(cd_hit(fasta)) if use_cd_hit

  parse_vsearch_output(vsearch(fasta))
end
28
+
29
# Run cd-hit-est on +fasta+ and return the path of its cluster report.
#
# The clustered FASTA is written to the current directory as
# "<basename of fasta without extension>_cdhit.fa"; the returned path is
# that name with ".clstr" appended. Cmd#run is given the output name, so
# the command is skipped when that file already exists.
#
# Raises RuntimeError if the command reports failure. Previously the
# status was ignored, so a failure only showed up later as a missing
# .clstr file.
def cd_hit fasta
  puts "running cd-hit-est" if @verbose
  output = "#{File.basename(fasta, File.extname(fasta))}_cdhit.fa"
  cdhit_cmd = generate_cdhit_command fasta, output
  puts cdhit_cmd if @verbose
  cluster = Cmd.new cdhit_cmd
  cluster.run output
  unless cluster.status.success?
    raise "cd-hit-est failed:\n#{cluster.stderr}"
  end
  return "#{output}.clstr"
end
38
+
39
# Run vsearch clustering on +fasta+ and return the path of the cluster
# table it writes ("<fasta>.clust", passed to vsearch as --uc).
#
# Cmd#run is given the output path, so the command is skipped when that
# file already exists.
#
# Raises RuntimeError if the command reports failure. Previously the
# status was ignored, so a failure only showed up later as a missing
# output file.
def vsearch fasta
  puts "running vsearch" if @verbose
  cluster_output = "#{fasta}.clust"
  vsearch_cmd = generate_vsearch_command fasta, cluster_output
  puts vsearch_cmd if @verbose # echo the command, as cd_hit does
  cluster = Cmd.new vsearch_cmd
  cluster.run cluster_output
  unless cluster.status.success?
    raise "vsearch failed:\n#{cluster.stderr}"
  end
  return cluster_output
end
47
+
48
# Build the cd-hit-est command line as a single string.
#
# fasta - input FASTA path (-i)
# out   - output path (-o)
#
# Returns the command String.
def generate_cdhit_command fasta, out
  #cd-hit-est -i all.fa -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
  [
    "#{@cdhit}",
    "-i #{fasta}",
    "-o #{out}",
    "-c #{@id}",       # similarity = number of identical bases /
                       # length of shorter sequences
    "-T #{@threads}",
    "-n 10",           # word length - maybe increase??
    "-d 100",          # output name width
    "-g 1",            # slower but more accurate mode
    "-M 8000"          # increase memory
  ].join(" ")
end
61
+
62
# Build the vsearch clustering command line as a single string.
#
# fasta - input FASTA path (--cluster_fast)
# out   - path for the cluster table (--uc)
#
# Returns the command String.
def generate_vsearch_command fasta, out
  options = [
    "--cluster_fast #{fasta}",
    "--id #{@id}",
    "--iddef 0",             # cd-hit definition of sequence id
    "--qmask none",          # no masking
    "--strand both",
    "--uc #{out}",
    "--threads #{@threads}"
  ]
  return "#{@vsearch} #{options.join(' ')}"
end
73
+
74
# Parse a cd-hit cluster report into a Hash of cluster id => members.
#
# cluster_output - path of the .clstr file
#
# Three kinds of line are recognised:
#   ">Cluster N"                    starts cluster N
#   "... >name... at +/99.5%"       member with an explicit strand
#   "... >name... *"                the line marked with "*"; recorded
#                                   with strand "+"
# Anything else is ignored.
#
# Returns { Integer => [{ :name => String, :strand => "+" | "-" }, ...] }.
def parse_output cluster_output
  puts "parsing cd-hit output #{cluster_output}" if @verbose
  cluster_id = 0
  clusters = {}
  # File.foreach closes the file when done; the previous
  # File.open(...).each_line left the handle open.
  File.foreach(cluster_output) do |line|
    case line
    when />Cluster\ ([0-9]+)/
      cluster_id = $1.to_i
    when /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
      # the third capture (percent identity) is not used
      (clusters[cluster_id] ||= []) << { :name => $1, :strand => $2 }
    when /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
      (clusters[cluster_id] ||= []) << { :name => $1, :strand => "+" }
    end
  end
  return clusters
end
96
+
97
# Parse the tab-separated table vsearch writes for --uc into a Hash of
# cluster number => members.
#
# cluster_output - path of the table
#
# Only lines starting with "S" or "H" are used. Column 2 is read as the
# cluster number, column 5 as the strand and column 9 as the contig name.
# Any strand value other than "-" is recorded as "+".
# NOTE(review): in the vsearch manual S is the cluster seed (strand "*")
# and H is a hit against it - confirm against the installed version.
#
# Members have the same shape parse_output produces, which is what
# sequence_alignment reads (it indexes each member with :name and
# :strand). Returning bare names made that step raise, and dropped the
# strand even though clustering runs with --strand both.
#
# Returns { Integer => [{ :name => String, :strand => "+" | "-" }, ...] }.
def parse_vsearch_output cluster_output
  clusters = {}
  # File.foreach closes the file when done; the previous
  # File.open(...).each_line left the handle open.
  File.foreach(cluster_output) do |line|
    next unless line.start_with?("S", "H")
    cols = line.chomp.split("\t")
    cluster = cols[1].to_i
    strand = cols[4] == "-" ? "-" : "+"
    (clusters[cluster] ||= []) << { :name => cols[8], :strand => strand }
  end
  return clusters
end
110
+
111
+ end
112
+
113
+ end
@@ -0,0 +1,38 @@
1
+ require 'open3'
2
+
3
module Transfuse

  # Placeholder status for a command that was skipped because its output
  # file already existed; it always reports success.
  class Status
    def success?
      true
    end
  end

  # A command string together with the captured results of running it.
  class Cmd

    attr_accessor :cmd, :stdout, :stderr, :status

    def initialize cmd
      @cmd = cmd
    end

    # Run the command unless +file+ is given and already exists.
    #
    # file - optional path; if it exists the command is not run, stdout
    #        and stderr are set to empty strings and status to a Status.
    #
    # Returns true when the command was skipped, false when it was run
    # (stdout, stderr and status then come from Open3.capture3).
    def run file=nil
      if !file.nil? && File.exist?(file)
        @stdout, @stderr, @status = "", "", Status.new
        return true
      end
      @stdout, @stderr, @status = Open3.capture3 @cmd
      false
    end

    def to_s
      @cmd
    end

  end

end
@@ -0,0 +1,201 @@
1
class String
  # Reverse complement of a nucleotide sequence.
  #
  # Complements A/C/G/T and the IUPAC ambiguity codes R/Y, K/M, B/V and
  # D/H in both upper and lower case, then reverses the string. Previously
  # only uppercase ACGT was complemented, so lowercase (e.g. soft-masked)
  # bases were reversed but left uncomplemented. Characters that are their
  # own complement (N, S, W) and anything unrecognised pass through
  # unchanged.
  #
  # Returns a new String; the receiver is not modified.
  def revcomp
    self.tr("ACGTRYKMBVDHacgtrykmbvdh", "TGCAYRMKVBHDtgcayrmkvbhd").reverse
  end
end
6
+
7
+ module Transfuse
8
+
9
+ require 'csv'
10
+ require 'transrate'
11
+
12
+ class Transfuse
13
+
14
# Store the run settings and look up the clustalo executable.
#
# threads - thread count handed on to clustering and transrate
# verbose - when truthy, progress messages are printed
#
# Raises RuntimeError when Which::which finds no clustalo.
def initialize threads, verbose
  @threads, @verbose = threads, verbose
  @clustalo = Which::which('clustalo').first
  return if @clustalo
  raise "clustalo was not in the PATH - please install it"
end
20
+
21
# Turn a comma-separated list of paths into an Array of absolute paths,
# checking that every file exists.
#
# string - e.g. "a.fa,b.fa". Whitespace around entries and empty entries
#          (as in "a.fa, b.fa" or a doubled comma) are ignored; previously
#          they caused a spurious "not found" abort.
#
# Returns the expanded paths in the order given.
# Aborts the program on the first path that does not exist.
def check_files string
  string.split(",").map(&:strip).reject(&:empty?).map do |file|
    path = File.expand_path(file)
    abort "#{path} not found" unless File.exist?(path)
    puts "#{path} exists" if @verbose
    path
  end
end
34
+
35
# Concatenate the assembly files into one FASTA file in the current
# directory and return its absolute path.
#
# assemblies - Array of FASTA paths
#
# The output is named "all-<p1>-<p2>-....fa", where each <p> is the first
# six characters of an input's basename without its extension. If that
# file already exists it is reused as is.
#
# The copy is done in Ruby rather than through a shell "cat ... > out":
# the shell command was built from unquoted paths (so spaces and shell
# metacharacters broke it), its failures were ignored, and the redirect
# created the output file even when cat failed. The data is written to a
# ".part" file and renamed at the end, so an interrupted run cannot leave
# a truncated file that a later run would reuse.
#
# Raises a SystemCallError (e.g. Errno::ENOENT) if an input cannot be read.
def concatenate assemblies
  prefixes = assemblies.map do |name|
    File.basename(name, File.extname(name))[0..5]
  end
  catted_fasta = "all-" + prefixes.join("-") + ".fa"
  puts "concatenating assemblies into #{catted_fasta}" if @verbose
  unless File.exist?(catted_fasta)
    partial = "#{catted_fasta}.part"
    File.open(partial, "wb") do |out|
      assemblies.each do |file|
        File.open(file, "rb") { |input| IO.copy_stream(input, out) }
      end
    end
    File.rename(partial, catted_fasta)
  end
  return File.expand_path(catted_fasta)
end
53
+
54
# Cluster the sequences in +file+ with a new Cluster built from this
# object's thread and verbosity settings.
#
# Returns whatever Cluster#run returns for +file+.
def cluster file
  puts "clustering #{file}" if @verbose
  Cluster.new(@threads, @verbose).run(file)
end
59
+
60
# Read every entry of the FASTA file +fasta+ into @sequences, keyed by
# entry id, with the sequence stored as a plain String. Any previously
# loaded sequences are discarded.
def load_fasta fasta
  @sequences = {}
  Bio::FastaFormat.open(fasta).each { |record| @sequences[record.entry_id] = record.seq.to_s }
end
66
+
67
# Align the members of every cluster that has more than five sequences
# with clustalo, writing each alignment to "cluster<id>.fa" in the current
# directory. Sequences are taken from @sequences (filled by load_fasta).
#
# clusters - Hash of cluster id => members. A member is either a Hash
#            with :name and :strand ("+" or "-"), or a bare contig name,
#            which is treated as forward strand. Previously a bare name
#            raised when it was indexed with :name.
#
# Members on the "-" strand are reverse-complemented before alignment.
#
# The unaligned FASTA is handed to clustalo through a temporary file
# ("cluster<id>_unaligned.fa", removed afterwards) instead of
# `echo -e "..." |`. The echo form depended on the shell's echo accepting
# -e and broke on names containing shell metacharacters.
#
# Aborts if a member has no sequence, has an unknown strand, or clustalo
# reports failure. Returns +clusters+.
def sequence_alignment clusters
  clusters.each do |id, list| # threach
    next unless list.size > 5

    fasta = ""
    list.each do |member|
      if member.is_a?(Hash)
        name, strand = member[:name], member[:strand]
      else
        name, strand = member, "+"
      end
      sequence = @sequences[name]
      abort "Can't find sequence for '#{name}'" if sequence.nil?
      case strand
      when "+" then fasta << ">#{name}\n#{sequence}\n"
      when "-" then fasta << ">#{name}\n#{sequence.revcomp}\n"
      else abort "Unknown strand #{strand}"
      end
    end

    input = "cluster#{id}_unaligned.fa"
    File.open(input, "wb") { |out| out.write fasta }
    begin
      cmd = "#{@clustalo} -i #{input} --outfmt fa "
      cmd << "--output-order tree-order"
      align = Cmd.new cmd
      align.run
    ensure
      File.delete(input) if File.exist?(input)
    end
    unless align.status.success?
      abort "clustalo failed on cluster #{id}:\n#{align.stderr}"
    end

    File.open("cluster#{id}.fa", "wb") do |out|
      out.write align.stdout
    end
  end
end
91
+
92
# Load contig scores from CSV files that have a header row containing the
# columns "contig_name" and "score" (header matching goes through CSV's
# :symbol header converter).
#
# files - Array of CSV paths; when a name occurs more than once, the last
#         value read wins.
#
# Names are kept as the Strings found in the file. The previous
# `:converters => :all` turned a numeric-looking contig name such as "123"
# into an Integer key, which later String lookups could never match. Only
# the score is converted, always to Float.
#
# NOTE(review): filter and transrate key scores as
# "contig<index>_<name>", whereas names here are stored exactly as they
# appear in the CSV - confirm the score files already carry that prefix.
#
# Returns { contig name (String) => score (Float, or nil when empty) }.
# Raises ArgumentError if a score is not a valid number.
def load_scores files
  scores = {}
  files.each do |file|
    CSV.foreach(file, :headers => true,
                      :header_converters => :symbol) do |row|
      score = row[:score]
      scores[row[:contig_name]] = score.nil? ? nil : Float(score)
    end
  end
  return scores
end
105
+
106
# Write a filtered copy of each assembly that keeps only contigs scoring
# above 0.01, renaming each contig to "contig<index>_<original id>", where
# <index> is the assembly's position in +files+.
#
# files  - Array of FASTA paths
# scores - Hash of renamed contig name => numeric score
#
# Output goes to "<basename without extension>_filtered.fa" in the current
# directory; an existing output file is reused as is. The data is written
# to a ".part" file and renamed once complete: previously an abort midway
# left a truncated "_filtered.fa" that the next run silently reused.
#
# The "opening ..." progress line is now printed only in verbose mode,
# like the other progress messages in this class.
#
# Returns the absolute paths of the filtered files, in input order.
# Aborts if a contig has no entry in +scores+.
def filter files, scores
  filtered_files = []
  files.each_with_index do |file, index|
    new_filename = "#{File.basename(file, File.extname(file))}_filtered.fa"
    unless File.exist?(new_filename)
      partial = "#{new_filename}.part"
      File.open(partial, "wb") do |out|
        puts "opening #{file}..." if @verbose
        Bio::FastaFormat.open(file).each do |entry|
          contig_name = "contig#{index}_#{entry.entry_id}"
          unless scores.key?(contig_name)
            abort "Can't find '#{contig_name}' in scores"
          end
          next unless scores[contig_name] > 0.01
          out.write ">#{contig_name}\n"
          out.write "#{entry.seq}\n"
        end
      end
      File.rename(partial, new_filename)
    end
    filtered_files << File.expand_path(new_filename)
  end
  return filtered_files
end
129
+
130
+
131
# Obtain a score for every contig of every assembly.
#
# files - Array of assembly FASTA paths
# left  - Array of left read files (joined with commas for transrate)
# right - Array of right read files (joined with commas for transrate)
#
# If "scores.csv" exists in the current directory it is used as a cache:
# each line is "<name><TAB><score>" and nothing is recomputed. Otherwise
# transrate read metrics are run on each assembly, every contig is stored
# as "contig<index>_<name>" with its score, and the cache file is written.
#
# The cache is read with File.foreach so the handle is closed (the earlier
# File.open(...).each left it open). Lines without both a name and a score
# are skipped instead of producing a nil => 0.0 entry.
#
# Returns { contig name (String) => score }.
def transrate files, left, right
  scores = {}
  scores_file = "scores.csv"
  if File.exist?(scores_file)
    puts "loading scores from file" if @verbose
    File.foreach(scores_file) do |line|
      name, score = line.chomp.split("\t")
      next if name.nil? || score.nil?
      scores[name] = score.to_f
    end
  else
    files.each_with_index do |fasta, index|
      puts "transrate on #{fasta}" if @verbose
      assembly = Transrate::Assembly.new(fasta)
      transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
      transrater.read_metrics(left.join(','), right.join(','))
      assembly.each do |name, contig|
        scores["contig#{index}_#{name}"] = contig.score
      end
    end
    File.open(scores_file, "wb") do |out|
      scores.each do |name, score|
        out.write "#{name}\t#{score}\n"
      end
    end
  end
  return scores
end
159
+
160
# Pick the highest-scoring contig of every cluster.
#
# clusters - Hash of cluster id => members; a member is either a contig
#            name or a Hash with a :name key (the shape parse_output
#            builds), so either kind of cluster listing is accepted.
# scores   - Hash of contig name => numeric score
#
# The running best starts from the first member rather than from 0.
# Before, a cluster whose scores were all <= 0 contributed the empty name
# "" to the result. On ties the earliest member wins, as before. A cluster
# with no members contributes nothing.
#
# Returns an Array of contig names, one per non-empty cluster.
# Aborts if a member has no entry in +scores+.
def select_contigs clusters, scores
  puts "selecting contigs" if @verbose
  best = []
  clusters.each do |cluster_id, list|
    best_score = nil
    best_contig = nil
    list.each do |member|
      contig_name = member.is_a?(Hash) ? member[:name] : member
      unless scores[contig_name]
        abort "can't find #{contig_name} in scores hash\n"
      end
      if best_score.nil? || scores[contig_name] > best_score
        best_score = scores[contig_name]
        best_contig = contig_name
      end
    end
    best << best_contig unless best_contig.nil?
  end
  return best
end
179
+
180
# Write the selected contigs to +output+ in FASTA format.
#
# best   - Array of contig names to write, in output order
# fasta  - FASTA file the sequences are read from
# output - path of the file to create
#
# A name that is not present in +fasta+ is reported on stdout and skipped.
def output_contigs best, fasta, output
  puts "writing contigs" if @verbose
  # read in catted fasta sequences
  sequences = {}
  Bio::FastaFormat.open(fasta).each { |entry| sequences[entry.entry_id] = entry.seq }
  File.open(output, "wb") do |out|
    best.each do |contig_name|
      unless sequences.key?(contig_name)
        puts "can't find #{contig_name} in #{fasta}"
        next
      end
      out.write ">#{contig_name}\n"
      out.write "#{sequences[contig_name]}\n"
    end
  end
end
198
+
199
+ end
200
+
201
+ end
@@ -0,0 +1,17 @@
1
module Transfuse

  # Version of this codebase, used in help messages and when building the
  # gem. Bump the components according to Semantic Versioning 2.0
  # (http://semver.org/).
  module VERSION
    MAJOR, MINOR, PATCH = 0, 1, 1
    BUILD = nil # left out of STRING while nil

    # Dotted version string made of the non-nil components.
    STRING = [MAJOR, MINOR, PATCH, BUILD].reject(&:nil?).map(&:to_s) * '.'
  end

end # Transfuse
data/lib/transfuse.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'transfuse/cluster.rb'
2
+ require 'transfuse/cmd.rb'
3
+ require 'transfuse/transfuse.rb'
4
+ require 'transfuse/version.rb'
data/notes.md ADDED
@@ -0,0 +1,3 @@
1
+ ## Transfuse notes
2
+
3
+ Musings and mutterings about things we are doing in this project