transfuse 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +87 -0
- data/README.md +1 -0
- data/Rakefile +20 -0
- data/bin/transfuse +73 -0
- data/deps/deps.yaml +0 -0
- data/lib/transfuse/cluster.rb +113 -0
- data/lib/transfuse/cmd.rb +38 -0
- data/lib/transfuse/transfuse.rb +201 -0
- data/lib/transfuse/version.rb +17 -0
- data/lib/transfuse.rb +4 -0
- data/notes.md +3 -0
- data/test/data/assembly1.fasta +500 -0
- data/test/data/assembly2.fasta +500 -0
- data/test/data/contig_scores1.csv +100 -0
- data/test/helper.rb +16 -0
- data/test/test_cluster.rb +36 -0
- data/test/test_transfuse.rb +107 -0
- data/transfuse.gemspec +29 -0
- metadata +249 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9301f6cf6f1e24b506789da98e79c1b87cc4cfb7
|
4
|
+
data.tar.gz: 066ee9226775d9492a28291d5ee6b16fe79e61e3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 48eb4011a41c92936752b8e7ec1aa53ab12a2737d62b6c0b733f09edf35aebe17e63562513a8a3a60ce8a0cef17201150d8c8432d649f2f8b498eb2f1b9822d3
|
7
|
+
data.tar.gz: 2339858d5e0e19184cbf9d259506ab7efd5d786fcb5ca95ffa62db13c481efc08d24fea58b47a6637391a1b7a45740074a76961609c2f3164761621657785d25
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
transfuse (0.1.1)
|
5
|
+
bindeps (~> 1.0, >= 1.0.1)
|
6
|
+
bio (~> 1.4, >= 1.4.3)
|
7
|
+
fixwhich (~> 1.0, >= 1.0.2)
|
8
|
+
transrate (= 1.0.0.beta3)
|
9
|
+
trollop (~> 2.0)
|
10
|
+
|
11
|
+
GEM
|
12
|
+
remote: https://rubygems.org/
|
13
|
+
specs:
|
14
|
+
ansi (1.5.0)
|
15
|
+
bindeps (1.1.2)
|
16
|
+
fixwhich (~> 1.0, >= 1.0.2)
|
17
|
+
bio (1.4.3.0001)
|
18
|
+
coveralls (0.8.1)
|
19
|
+
json (~> 1.8)
|
20
|
+
rest-client (>= 1.6.8, < 2)
|
21
|
+
simplecov (~> 0.10.0)
|
22
|
+
term-ansicolor (~> 1.3)
|
23
|
+
thor (~> 0.19.1)
|
24
|
+
crb-blast (0.6.4)
|
25
|
+
bindeps (~> 1.0, >= 1.0.3)
|
26
|
+
bio (~> 1.4, >= 1.4.3)
|
27
|
+
fixwhich (~> 1.0, >= 1.0.2)
|
28
|
+
threach (~> 0.2, >= 0.2.0)
|
29
|
+
trollop (~> 2.0)
|
30
|
+
docile (1.1.5)
|
31
|
+
domain_name (0.5.24)
|
32
|
+
unf (>= 0.0.5, < 1.0.0)
|
33
|
+
facade (1.0.6)
|
34
|
+
fix-trinity-output (1.0.0)
|
35
|
+
trollop (~> 2.0)
|
36
|
+
fixwhich (1.0.2)
|
37
|
+
pathname2 (~> 1.4, >= 1.4.4)
|
38
|
+
http-cookie (1.0.2)
|
39
|
+
domain_name (~> 0.5)
|
40
|
+
json (1.8.3)
|
41
|
+
mime-types (2.6.1)
|
42
|
+
minitest (4.7.5)
|
43
|
+
netrc (0.10.3)
|
44
|
+
pathname2 (1.7.3)
|
45
|
+
facade
|
46
|
+
rake (10.4.2)
|
47
|
+
rest-client (1.8.0)
|
48
|
+
http-cookie (>= 1.0.2, < 2.0)
|
49
|
+
mime-types (>= 1.16, < 3.0)
|
50
|
+
netrc (~> 0.7)
|
51
|
+
shoulda-context (1.2.1)
|
52
|
+
simplecov (0.10.0)
|
53
|
+
docile (~> 1.1.0)
|
54
|
+
json (~> 1.8)
|
55
|
+
simplecov-html (~> 0.10.0)
|
56
|
+
simplecov-html (0.10.0)
|
57
|
+
term-ansicolor (1.3.0)
|
58
|
+
tins (~> 1.0)
|
59
|
+
thor (0.19.1)
|
60
|
+
threach (0.2.0)
|
61
|
+
tins (1.5.2)
|
62
|
+
transrate (1.0.0.beta3)
|
63
|
+
bindeps (~> 1.1, >= 1.1.2)
|
64
|
+
bio (~> 1.4, >= 1.4.3)
|
65
|
+
crb-blast (~> 0.5, >= 0.5.0)
|
66
|
+
fix-trinity-output (~> 1.0, >= 1.0)
|
67
|
+
trollop (~> 2.0, >= 2.0.0)
|
68
|
+
yell (~> 2.0, >= 2.0.4)
|
69
|
+
trollop (2.1.1)
|
70
|
+
turn (0.9.7)
|
71
|
+
ansi
|
72
|
+
minitest (~> 4)
|
73
|
+
unf (0.1.4)
|
74
|
+
unf_ext
|
75
|
+
unf_ext (0.0.7.1)
|
76
|
+
yell (2.0.5)
|
77
|
+
|
78
|
+
PLATFORMS
|
79
|
+
ruby
|
80
|
+
|
81
|
+
DEPENDENCIES
|
82
|
+
coveralls (~> 0.7)
|
83
|
+
rake (~> 10.3, >= 10.3.2)
|
84
|
+
shoulda-context (~> 1.2, >= 1.2.1)
|
85
|
+
simplecov (~> 0.8, >= 0.8.2)
|
86
|
+
transfuse!
|
87
|
+
turn (~> 0.9, >= 0.9.7)
|
data/README.md
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Transfuse
|
data/Rakefile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'rake/testtask'

# `rake test`: no test_files are set, so Rake::TestTask falls back to its
# default file pattern.
Rake::TestTask.new do |t|
  t.libs << 'test'
end

# NOTE(review): test/test_corset.rb is not among the files packaged with this
# gem; this task looks like a leftover from another project. Kept so that
# `rake corset` still resolves - confirm and remove if the file is gone.
Rake::TestTask.new do |t|
  t.name = :corset
  t.libs << 'test'
  t.test_files = ['test/test_corset.rb']
end

# `rake cluster`: only the clustering tests.
Rake::TestTask.new do |t|
  t.name = :cluster
  t.libs << 'test'
  t.test_files = ['test/test_cluster.rb']
end

# `rake transfuse`: only the tests for the Transfuse class.
Rake::TestTask.new do |t|
  t.name = :transfuse
  t.libs << 'test'
  t.test_files = ['test/test_transfuse.rb']
end

desc "Run tests"
task :default => :test
|
data/bin/transfuse
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command line entry point: merges several transcriptome assemblies.
#
# Contig scores come either from existing transrate CSV files (--scores) or
# are generated by running transrate on the reads (--left and --right).

require 'trollop'
require 'transfuse'

# With no arguments at all, show the help text instead of a "missing
# required option" error.
ARGV[0] = "--help" if ARGV.empty?

opts = Trollop::options do
  version Transfuse::VERSION::STRING.dup
  banner <<-EOS

  Transfuse v#{Transfuse::VERSION::STRING.dup}
  by Chris Boursnell <cmb211@cam.ac.uk> and
     Richard Smith-Unna <rds45@cam.ac.uk>

  DESCRIPTION:
  Merge multiple assemblies.

  USAGE:
  transfuse <options>

  OPTIONS:

  EOS
  opt :assembly, "assembly files in FASTA format, comma-separated",
      :type => String, :required => true
  opt :scores, "transrate contig score output files, comma-separated",
      :type => String
  opt :left, "left reads file in FASTQ format",
      :type => String
  opt :right, "right reads file in FASTQ format",
      :type => String
  opt :output, "write merged assembly to file",
      :type => String, :required => true
  opt :threads, "number of threads", :type => :int, :default => 1
  opt :verbose, "be verbose"
end

transfuse = Transfuse::Transfuse.new opts.threads, opts.verbose

# check_files aborts if any listed file is missing and returns absolute paths.
assembly_files = transfuse.check_files opts.assembly
# The option is called :scores; reading opts.score always gave nil, which left
# score_files unset and made load_scores fail further down.
score_files = transfuse.check_files opts.scores if opts.scores
left = transfuse.check_files opts.left if opts.left
right = transfuse.check_files opts.right if opts.right

if opts.scores
  # load the scores from the comma separated list of files
  scores = transfuse.load_scores score_files
elsif opts.left && opts.right
  scores = transfuse.transrate assembly_files, left, right
else
  msg = "Please provide either transrate contig scores as csv files or\n"
  msg << "left and right fastq files to generate scores using transrate"
  abort msg
end

assembly_files = transfuse.filter assembly_files, scores

# concatenate assemblies into one fasta file
cat = transfuse.concatenate assembly_files

# load fasta sequences from concatenated file into hash
transfuse.load_fasta cat

# cluster using vsearch or maybe cd-hit-est
clusters = transfuse.cluster cat

transfuse.sequence_alignment clusters
# pull out contigs from each cluster based on the scores
# best = transfuse.select_contigs clusters, scores

# transfuse.output_contigs best, cat, opts.output
|
73
|
+
|
data/deps/deps.yaml
ADDED
File without changes
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module Transfuse
|
2
|
+
|
3
|
+
require 'bio'
|
4
|
+
require 'fixwhich'
|
5
|
+
|
6
|
+
# Clusters the contigs of a FASTA file by sequence identity, using vsearch
# (the active backend) or cd-hit-est, and parses the tool's cluster report.
class Cluster

  # @param threads [Integer] thread count passed to the clustering tool
  # @param verbose [Boolean] print progress messages to stdout
  # @raise [RuntimeError] when cd-hit-est or vsearch is not found on the PATH
  def initialize threads, verbose
    @cdhit = Which::which('cd-hit-est').first
    raise "cd-hit-est was not in the PATH - please install it" unless @cdhit
    @vsearch = Which::which('vsearch').first
    raise "vsearch was not in the PATH - please install it" unless @vsearch
    @id = "1.00" # identity threshold handed to both tools
    @threads = threads
    @verbose = verbose
  end

  # Clusters +fasta+ and returns the parsed clusters.
  #
  # @param fasta [String] path to the FASTA file to cluster
  # @return [Hash{Integer => Array}] cluster id => members; see
  #   parse_vsearch_output / parse_output for the shape of a member
  def run fasta
    use_cd_hit = false # hard-wired backend switch; vsearch is used
    if use_cd_hit
      parse_output(cd_hit(fasta))
    else
      parse_vsearch_output(vsearch(fasta))
    end
  end

  # Runs cd-hit-est on +fasta+; the output is named after the input's
  # basename and written relative to the current directory.
  #
  # @return [String] path of the ".clstr" report
  # @raise [RuntimeError] when cd-hit-est exits unsuccessfully
  def cd_hit fasta
    puts "running cd-hit-est" if @verbose
    output = "#{File.basename(fasta, File.extname(fasta))}_cdhit.fa"
    execute(generate_cdhit_command(fasta, output), output, "cd-hit-est")
    "#{output}.clstr"
  end

  # Runs vsearch on +fasta+, writing the report next to the input.
  #
  # @return [String] path of the ".clust" report (vsearch --uc output)
  # @raise [RuntimeError] when vsearch exits unsuccessfully
  def vsearch fasta
    puts "running vsearch" if @verbose
    cluster_output = "#{fasta}.clust"
    execute(generate_vsearch_command(fasta, cluster_output), cluster_output,
            "vsearch")
    cluster_output
  end

  # Builds the cd-hit-est command line.
  #
  # @return [String]
  def generate_cdhit_command fasta, out
    #cd-hit-est -i all.fa -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
    cmd = "#{@cdhit}"
    cmd << " -i #{fasta}"
    cmd << " -o #{out}"
    cmd << " -c #{@id}" # similarity = number of identical bases /
                        #              length of shorter sequences
    cmd << " -T #{@threads}"
    cmd << " -n 10"   # word length - maybe increase??
    cmd << " -d 100"  # output name width
    cmd << " -g 1"    # slower but more accurate mode
    cmd << " -M 8000" # increase memory
    cmd
  end

  # Builds the vsearch command line.
  #
  # @return [String]
  def generate_vsearch_command fasta, out
    command = "#{@vsearch}"
    command << " --cluster_fast #{fasta}"
    command << " --id #{@id}"
    command << " --iddef 0"     # cd-hit definition of sequence id
    command << " --qmask none"  # no masking
    command << " --strand both"
    command << " --uc #{out}"
    command << " --threads #{@threads}"
    command
  end

  # Parses a cd-hit ".clstr" report.
  #
  # @param cluster_output [String] path to the report
  # @return [Hash{Integer => Array<Hash>}] cluster id => members, each member
  #   being { :name => contig name, :strand => "+" or "-" }; the "*"
  #   (representative) line of a cluster is recorded with strand "+"
  def parse_output cluster_output
    puts "parsing cd-hit output #{cluster_output}" if @verbose
    cluster_id = 0
    clusters = {}
    # File.foreach closes the file when done; File.open(...).each_line left
    # the handle open.
    File.foreach(cluster_output) do |line|
      if line =~ />Cluster\ ([0-9]+)/
        cluster_id = Regexp.last_match(1).to_i
      elsif line =~ /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
        member = { :name => Regexp.last_match(1),
                   :strand => Regexp.last_match(2) }
        (clusters[cluster_id] ||= []) << member
      elsif line =~ /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
        member = { :name => Regexp.last_match(1), :strand => "+" }
        (clusters[cluster_id] ||= []) << member
      end
    end
    clusters
  end

  # Parses a vsearch --uc report. Only "S" and "H" records are read; column 2
  # is used as the cluster number and column 9 as the contig name.
  #
  # @param cluster_output [String] path to the report
  # @return [Hash{Integer => Array<String>}] cluster number => contig names
  #   (unlike parse_output, members are bare names without strand)
  def parse_vsearch_output cluster_output
    clusters = {}
    File.foreach(cluster_output) do |line|
      next unless line.start_with?("S", "H")
      cols = line.chomp.split("\t")
      (clusters[cols[1].to_i] ||= []) << cols[8]
    end
    clusters
  end

  private

  # Runs +command+ through Cmd (skipped when +output+ already exists) and
  # raises when the tool reports failure, rather than letting a missing
  # report surface later as a file-not-found error.
  def execute command, output, tool
    puts command if @verbose
    job = Cmd.new command
    job.run output
    return if job.status.success?
    raise "#{tool} failed:\n#{job.stderr}"
  end

end
|
112
|
+
|
113
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'open3'
|
2
|
+
|
3
|
+
module Transfuse
|
4
|
+
|
5
|
+
# Minimal stand-in for a process status whose #success? is always true.
# Cmd#run assigns one as its status when it skips execution because the
# expected output file already exists.
class Status
  # @return [Boolean] always true
  def success?
    true
  end
end
|
10
|
+
|
11
|
+
# Wraps a command line and captures its stdout, stderr and exit status.
# Execution can be skipped when the command's output file is already present.
class Cmd

  attr_accessor :cmd, :stdout, :stderr, :status

  # @param cmd [String] command line passed to Open3.capture3
  def initialize cmd
    @cmd = cmd
  end

  # Runs the command, unless +file+ is given and already exists.
  #
  # When skipped, stdout and stderr are set to empty strings and status to a
  # Status (always successful).
  #
  # @param file [String, nil] path of the output the command would produce
  # @return [Boolean] true when execution was skipped because +file+ exists,
  #   false when the command was actually run
  def run file=nil
    if !file.nil? && File.exist?(file)
      @stdout = ""
      @stderr = ""
      @status = Status.new
      return true
    end
    @stdout, @stderr, @status = Open3.capture3 @cmd
    false
  end

  # @return [String] the command line
  def to_s
    @cmd
  end

end
|
37
|
+
|
38
|
+
end
|
@@ -0,0 +1,201 @@
|
|
1
|
+
class String
  # Returns the reverse complement of a nucleotide sequence.
  #
  # A, C, G and T are complemented in upper and lower case (case is
  # preserved). Any other character is left as it is and only reversed.
  # Previously lower-case bases were reversed without being complemented.
  #
  # @return [String] a new string; the receiver is not modified
  def revcomp
    tr("ACGTacgt", "TGCAtgca").reverse
  end
end
|
6
|
+
|
7
|
+
module Transfuse
|
8
|
+
|
9
|
+
require 'csv'
require 'open3'
require 'transrate'
|
11
|
+
|
12
|
+
# Steps of the assembly-merging pipeline: validate inputs, score and filter
# contigs, concatenate assemblies, cluster them, align clusters and pick the
# best contig per cluster.
class Transfuse

  # @param threads [Integer] thread count handed to transrate and clustering
  # @param verbose [Boolean] print progress messages to stdout
  # @raise [RuntimeError] when clustalo is not found on the PATH
  def initialize threads, verbose
    @threads = threads
    @verbose = verbose
    @clustalo = Which::which('clustalo').first
    raise "clustalo was not in the PATH - please install it" unless @clustalo
  end

  # Expands a comma-separated list of paths, aborting on the first one that
  # does not exist.
  #
  # @param string [String] comma-separated file names
  # @return [Array<String>] absolute paths, in input order
  def check_files string
    string.split(",").map do |file|
      path = File.expand_path(file)
      abort "#{path} not found" unless File.exist?(path)
      puts "#{path} exists" if @verbose
      path
    end
  end

  # Concatenates the assemblies into "all-<prefix>-<prefix>....fa" in the
  # current directory, each prefix being the first six characters of an
  # input's basename. An existing file of that name is reused.
  #
  # The copy is done in Ruby rather than through a shell `cat`, so paths with
  # spaces or shell metacharacters are handled.
  #
  # @param assemblies [Array<String>] FASTA paths
  # @return [String] absolute path of the concatenated file
  def concatenate assemblies
    prefixes = assemblies.map do |name|
      File.basename(name, File.extname(name))[0..5]
    end
    catted_fasta = "all-#{prefixes.join('-')}.fa"
    puts "concatenating assemblies into #{catted_fasta}" if @verbose
    unless File.exist?(catted_fasta)
      write_via_partial(catted_fasta) do |out|
        assemblies.each { |file| IO.copy_stream(file, out) }
      end
    end
    File.expand_path(catted_fasta)
  end

  # Clusters the contigs in +file+.
  #
  # @return [Hash] cluster id => members, as returned by Cluster#run
  def cluster file
    puts "clustering #{file}" if @verbose
    Cluster.new(@threads, @verbose).run(file)
  end

  # Loads every sequence of +fasta+ into @sequences (entry id => sequence
  # string) for later use by sequence_alignment.
  def load_fasta fasta
    @sequences = {}
    Bio::FastaFormat.open(fasta).each do |entry|
      @sequences[entry.entry_id] = entry.seq.to_s
    end
  end

  # Aligns every cluster with more than five members using clustalo and
  # writes each alignment to "cluster<id>.fa" in the current directory.
  #
  # Members may be hashes ({ :name, :strand }, the cd-hit shape) or bare
  # contig names (the vsearch shape). Bare names used to crash here; they
  # carry no strand information and are therefore aligned as given.
  #
  # Sequences are fed to clustalo on stdin instead of through a shell
  # `echo -e "..."`, which depended on the shell's echo and on quoting.
  #
  # @param clusters [Hash] cluster id => members
  def sequence_alignment clusters
    clusters.each do |id, list| # threach
      next unless list.size > 5
      fasta = list.map { |member| fasta_record(member) }.join
      stdout, stderr, status = Open3.capture3(
        @clustalo, "-i", "-", "--outfmt", "fa",
        "--output-order", "tree-order", :stdin_data => fasta
      )
      unless status.success?
        warn "clustalo failed for cluster #{id}: #{stderr}"
        next
      end
      File.open("cluster#{id}.fa", "wb") { |out| out.write stdout }
    end
  end

  # Reads contig scores from CSV files with a header row, using the
  # contig_name and score columns.
  #
  # NOTE(review): filter looks scores up under "contig<index>_<name>" while
  # this method keys them by the raw contig_name column - confirm that the
  # CSV files already contain prefixed names.
  #
  # @param files [Array<String>] CSV paths
  # @return [Hash] contig name => score
  def load_scores files
    scores = {}
    files.each do |file|
      CSV.foreach(file, :headers => true,
                        :header_converters => :symbol,
                        :converters => :all) do |row|
        scores[row[:contig_name]] = row[:score]
      end
    end
    scores
  end

  # Writes "<basename>_filtered.fa" (current directory) for every assembly,
  # keeping only contigs whose score is above +min_score+ and renaming them
  # to "contig<index>_<original id>". Existing filtered files are reused.
  #
  # Output goes to a ".partial" file that is renamed once complete, so an
  # abort half-way through no longer leaves a truncated file that the next
  # run would mistake for a finished one.
  #
  # @param files [Array<String>] FASTA paths
  # @param scores [Hash] prefixed contig name => score
  # @param min_score [Float] contigs scoring at or below this are dropped
  # @return [Array<String>] absolute paths of the filtered files
  def filter files, scores, min_score = 0.01
    files.each_with_index.map do |file, index|
      new_filename = "#{File.basename(file, File.extname(file))}_filtered.fa"
      unless File.exist?(new_filename)
        puts "opening #{file}..." if @verbose
        write_via_partial(new_filename) do |out|
          Bio::FastaFormat.open(file).each do |entry|
            contig_name = "contig#{index}_#{entry.entry_id}"
            unless scores.key?(contig_name)
              abort "Can't find '#{contig_name}' in scores"
            end
            next unless scores[contig_name] > min_score
            out.write ">#{contig_name}\n"
            out.write "#{entry.seq}\n"
          end
        end
      end
      File.expand_path(new_filename)
    end
  end

  # Scores every contig with transrate. Results are cached in "scores.csv"
  # (tab-separated name and score, despite the extension) in the current
  # directory and read back from there when the file exists.
  #
  # @param files [Array<String>] assembly FASTA paths
  # @param left [Array<String>] left read files
  # @param right [Array<String>] right read files
  # @return [Hash{String => Float}] "contig<index>_<name>" => score
  def transrate files, left, right
    scores = {}
    scores_file = "scores.csv"
    if File.exist?(scores_file)
      puts "loading scores from file" if @verbose
      # File.foreach closes the handle; File.open(...).each did not.
      File.foreach(scores_file) do |line|
        next if line.strip.empty?
        name, score = line.chomp.split("\t")
        scores[name] = score.to_f
      end
    else
      files.each_with_index do |fasta, index|
        puts "transrate on #{fasta}" if @verbose
        assembly = Transrate::Assembly.new(fasta)
        transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
        transrater.read_metrics(left.join(','), right.join(','))
        assembly.each do |name, contig|
          scores["contig#{index}_#{name}"] = contig.score
        end
      end
      File.open(scores_file, "wb") do |out|
        scores.each { |name, score| out.write "#{name}\t#{score}\n" }
      end
    end
    scores
  end

  # Picks the highest-scoring contig of every cluster.
  #
  # The previous implementation started from a best score of 0 and so
  # returned "" for clusters whose scores were all zero or negative. Members
  # may be bare names or { :name, :strand } hashes.
  #
  # @param clusters [Hash] cluster id => members
  # @param scores [Hash] contig name => score
  # @return [Array<String>] one contig name per non-empty cluster
  def select_contigs clusters, scores
    puts "selecting contigs" if @verbose
    best = []
    clusters.each_value do |list|
      names = list.map { |member| contig_name_of(member) }
      names.each do |name|
        abort "can't find #{name} in scores hash\n" unless scores[name]
      end
      winner = names.max_by { |name| scores[name] }
      best << winner if winner
    end
    best
  end

  # Writes the selected contigs, looked up in +fasta+, to +output+ in FASTA
  # format. Names missing from +fasta+ are reported on stdout and skipped.
  #
  # @param best [Array<String>] contig names to write
  # @param fasta [String] path of the concatenated FASTA file
  # @param output [String] path of the file to write
  def output_contigs best, fasta, output
    puts "writing contigs" if @verbose
    # read in catted fasta sequences
    sequences = {}
    Bio::FastaFormat.open(fasta).each do |entry|
      sequences[entry.entry_id] = entry.seq
    end
    File.open(output, "wb") do |out|
      best.each do |contig_name|
        if sequences.key?(contig_name)
          out.write ">#{contig_name}\n"
          out.write "#{sequences[contig_name]}\n"
        else
          puts "can't find #{contig_name} in #{fasta}"
        end
      end
    end
  end

  private

  # Name of a cluster member, whether it is a bare name or a member hash.
  def contig_name_of member
    member.is_a?(Hash) ? member[:name] : member
  end

  # One FASTA record for a cluster member, reverse-complemented when the
  # member is on the "-" strand.
  def fasta_record member
    name = contig_name_of(member)
    strand = member.is_a?(Hash) ? member[:strand] : "+"
    sequence = @sequences.fetch(name) do
      abort "Can't find '#{name}' in loaded sequences"
    end
    case strand
    when "+" then ">#{name}\n#{sequence}\n"
    when "-" then ">#{name}\n#{sequence.revcomp}\n"
    else abort "Unknown strand #{strand}"
    end
  end

  # Yields a writable handle on "<path>.partial" and renames it to +path+
  # only after the block finished without raising or aborting.
  def write_via_partial path
    partial = "#{path}.partial"
    File.open(partial, "wb") { |out| yield out }
    File.rename(partial, path)
  end

end
|
200
|
+
|
201
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Transfuse

  # Defines the version of this codebase.
  #
  # This module is used in help messages and in generating
  # the Gem. Versions must be incremented in accordance with
  # Semantic Versioning 2.0 (http://semver.org/).
  module VERSION
    MAJOR = 0
    MINOR = 1
    PATCH = 1
    BUILD = nil

    # Dotted version string; BUILD is left out while it is nil.
    STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
  end

end # Transfuse
|
data/lib/transfuse.rb
ADDED
data/notes.md
ADDED