transfuse 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +87 -0
- data/README.md +1 -0
- data/Rakefile +20 -0
- data/bin/transfuse +73 -0
- data/deps/deps.yaml +0 -0
- data/lib/transfuse/cluster.rb +113 -0
- data/lib/transfuse/cmd.rb +38 -0
- data/lib/transfuse/transfuse.rb +201 -0
- data/lib/transfuse/version.rb +17 -0
- data/lib/transfuse.rb +4 -0
- data/notes.md +3 -0
- data/test/data/assembly1.fasta +500 -0
- data/test/data/assembly2.fasta +500 -0
- data/test/data/contig_scores1.csv +100 -0
- data/test/helper.rb +16 -0
- data/test/test_cluster.rb +36 -0
- data/test/test_transfuse.rb +107 -0
- data/transfuse.gemspec +29 -0
- metadata +249 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9301f6cf6f1e24b506789da98e79c1b87cc4cfb7
|
4
|
+
data.tar.gz: 066ee9226775d9492a28291d5ee6b16fe79e61e3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 48eb4011a41c92936752b8e7ec1aa53ab12a2737d62b6c0b733f09edf35aebe17e63562513a8a3a60ce8a0cef17201150d8c8432d649f2f8b498eb2f1b9822d3
|
7
|
+
data.tar.gz: 2339858d5e0e19184cbf9d259506ab7efd5d786fcb5ca95ffa62db13c481efc08d24fea58b47a6637391a1b7a45740074a76961609c2f3164761621657785d25
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
transfuse (0.1.1)
|
5
|
+
bindeps (~> 1.0, >= 1.0.1)
|
6
|
+
bio (~> 1.4, >= 1.4.3)
|
7
|
+
fixwhich (~> 1.0, >= 1.0.2)
|
8
|
+
transrate (= 1.0.0.beta3)
|
9
|
+
trollop (~> 2.0)
|
10
|
+
|
11
|
+
GEM
|
12
|
+
remote: https://rubygems.org/
|
13
|
+
specs:
|
14
|
+
ansi (1.5.0)
|
15
|
+
bindeps (1.1.2)
|
16
|
+
fixwhich (~> 1.0, >= 1.0.2)
|
17
|
+
bio (1.4.3.0001)
|
18
|
+
coveralls (0.8.1)
|
19
|
+
json (~> 1.8)
|
20
|
+
rest-client (>= 1.6.8, < 2)
|
21
|
+
simplecov (~> 0.10.0)
|
22
|
+
term-ansicolor (~> 1.3)
|
23
|
+
thor (~> 0.19.1)
|
24
|
+
crb-blast (0.6.4)
|
25
|
+
bindeps (~> 1.0, >= 1.0.3)
|
26
|
+
bio (~> 1.4, >= 1.4.3)
|
27
|
+
fixwhich (~> 1.0, >= 1.0.2)
|
28
|
+
threach (~> 0.2, >= 0.2.0)
|
29
|
+
trollop (~> 2.0)
|
30
|
+
docile (1.1.5)
|
31
|
+
domain_name (0.5.24)
|
32
|
+
unf (>= 0.0.5, < 1.0.0)
|
33
|
+
facade (1.0.6)
|
34
|
+
fix-trinity-output (1.0.0)
|
35
|
+
trollop (~> 2.0)
|
36
|
+
fixwhich (1.0.2)
|
37
|
+
pathname2 (~> 1.4, >= 1.4.4)
|
38
|
+
http-cookie (1.0.2)
|
39
|
+
domain_name (~> 0.5)
|
40
|
+
json (1.8.3)
|
41
|
+
mime-types (2.6.1)
|
42
|
+
minitest (4.7.5)
|
43
|
+
netrc (0.10.3)
|
44
|
+
pathname2 (1.7.3)
|
45
|
+
facade
|
46
|
+
rake (10.4.2)
|
47
|
+
rest-client (1.8.0)
|
48
|
+
http-cookie (>= 1.0.2, < 2.0)
|
49
|
+
mime-types (>= 1.16, < 3.0)
|
50
|
+
netrc (~> 0.7)
|
51
|
+
shoulda-context (1.2.1)
|
52
|
+
simplecov (0.10.0)
|
53
|
+
docile (~> 1.1.0)
|
54
|
+
json (~> 1.8)
|
55
|
+
simplecov-html (~> 0.10.0)
|
56
|
+
simplecov-html (0.10.0)
|
57
|
+
term-ansicolor (1.3.0)
|
58
|
+
tins (~> 1.0)
|
59
|
+
thor (0.19.1)
|
60
|
+
threach (0.2.0)
|
61
|
+
tins (1.5.2)
|
62
|
+
transrate (1.0.0.beta3)
|
63
|
+
bindeps (~> 1.1, >= 1.1.2)
|
64
|
+
bio (~> 1.4, >= 1.4.3)
|
65
|
+
crb-blast (~> 0.5, >= 0.5.0)
|
66
|
+
fix-trinity-output (~> 1.0, >= 1.0)
|
67
|
+
trollop (~> 2.0, >= 2.0.0)
|
68
|
+
yell (~> 2.0, >= 2.0.4)
|
69
|
+
trollop (2.1.1)
|
70
|
+
turn (0.9.7)
|
71
|
+
ansi
|
72
|
+
minitest (~> 4)
|
73
|
+
unf (0.1.4)
|
74
|
+
unf_ext
|
75
|
+
unf_ext (0.0.7.1)
|
76
|
+
yell (2.0.5)
|
77
|
+
|
78
|
+
PLATFORMS
|
79
|
+
ruby
|
80
|
+
|
81
|
+
DEPENDENCIES
|
82
|
+
coveralls (~> 0.7)
|
83
|
+
rake (~> 10.3, >= 10.3.2)
|
84
|
+
shoulda-context (~> 1.2, >= 1.2.1)
|
85
|
+
simplecov (~> 0.8, >= 0.8.2)
|
86
|
+
transfuse!
|
87
|
+
turn (~> 0.9, >= 0.9.7)
|
data/README.md
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Transfuse
|
data/Rakefile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'rake/testtask'

# Default `test` task; only adds the test directory to the load path.
Rake::TestTask.new do |t|
  t.libs << 'test'
end

# One named task per single test file (`rake corset`, `rake cluster`).
# NOTE(review): test/test_corset.rb does not appear in this package's file
# list - confirm that the corset task still has something to run.
%w[corset cluster].each do |suite|
  Rake::TestTask.new do |t|
    t.name = suite.to_sym
    t.libs << 'test'
    t.test_files = ["test/test_#{suite}.rb"]
  end
end

desc "Run tests"
task :default => :test
|
data/bin/transfuse
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point: parses the options, obtains contig scores
# (either from files or by running transrate), filters and concatenates the
# assemblies, clusters them and aligns the clusters.

require 'trollop'
require 'transfuse'

# With no arguments at all, show the help text instead of an error.
ARGV << "--help" if ARGV.empty?

opts = Trollop::options do
  version Transfuse::VERSION::STRING.dup
  banner <<-EOS

  Transfuse v#{Transfuse::VERSION::STRING.dup}
  by Chris Boursnell <cmb211@cam.ac.uk> and
  Richard Smith-Unna <rds45@cam.ac.uk>

  DESCRIPTION:
  Merge multiple assemblies.

  USAGE:
  transfuse <options>

  OPTIONS:

  EOS
  opt :assembly, "assembly files in FASTA format, comma-separated",
      :type => String, :required => true
  opt :scores, "transrate contig score output files, comma-separated",
      :type => String
  opt :left, "left reads file in FASTQ format",
      :type => String
  opt :right, "right reads file in FASTQ format",
      :type => String
  opt :output, "write merged assembly to file",
      :type => String, :required => true
  opt :threads, "number of threads", :type => :int, :default => 1
  opt :verbose, "be verbose"
end

transfuse = Transfuse::Transfuse.new opts.threads, opts.verbose

assembly_files = transfuse.check_files opts.assembly
# The option is declared as :scores; reading opts.score always gave nil, so
# the score files were never checked and load_scores received nil.
score_files = transfuse.check_files opts.scores if opts.scores
left = transfuse.check_files opts.left if opts.left
right = transfuse.check_files opts.right if opts.right

if opts.scores
  # load the scores from the comma separated list of files
  scores = transfuse.load_scores score_files
elsif opts.left && opts.right
  scores = transfuse.transrate assembly_files, left, right
else
  msg = "Please provide either transrate contig scores as csv files or\n"
  msg << "left and right fastq files to generate scores using transrate"
  abort msg
end

assembly_files = transfuse.filter assembly_files, scores

# concatenate assemblies into one fasta file
cat = transfuse.concatenate assembly_files

# load fasta sequences from concatenated file into hash
transfuse.load_fasta cat

# cluster using vsearch or maybe cd-hit-est
clusters = transfuse.cluster cat

transfuse.sequence_alignment clusters
# pull out contigs from each cluster based on the scores
# best = transfuse.select_contigs clusters, scores

# transfuse.output_contigs best, cat, opts.output
|
73
|
+
|
data/deps/deps.yaml
ADDED
File without changes
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module Transfuse
|
2
|
+
|
3
|
+
require 'bio'
|
4
|
+
require 'fixwhich'
|
5
|
+
|
6
|
+
class Cluster
|
7
|
+
|
8
|
+
# Looks up the external clustering programs and stores the run settings.
#
# threads - thread count later passed to the clustering commands
# verbose - when truthy, progress messages are printed
#
# Raises RuntimeError if Which cannot find cd-hit-est or vsearch.
def initialize(threads, verbose)
  @cdhit = Which::which('cd-hit-est').first ||
           raise("cd-hit-est was not in the PATH - please install it")
  @vsearch = Which::which('vsearch').first ||
             raise("vsearch was not in the PATH - please install it")
  @id = "1.00" # identity threshold handed to both tools
  @threads, @verbose = threads, verbose
end
|
17
|
+
|
18
|
+
# Clusters the sequences in +fasta+ and returns the parsed clusters.
# The cd-hit-est branch is switched off by the local flag below, so the
# vsearch branch is the one that runs.
def run(fasta)
  use_cd_hit = false
  return parse_output(cd_hit(fasta)) if use_cd_hit

  parse_vsearch_output(vsearch(fasta))
end
|
28
|
+
|
29
|
+
# Runs cd-hit-est on +fasta+ and returns the name of the cluster file,
# which is the output name with ".clstr" appended. The output name is the
# basename of +fasta+ without its extension plus "_cdhit.fa".
def cd_hit(fasta)
  puts "running cd-hit-est" if @verbose
  stem = File.basename(fasta, File.extname(fasta))
  output = "#{stem}_cdhit.fa"
  command = generate_cdhit_command(fasta, output)
  puts command if @verbose
  # Cmd#run skips the command when +output+ already exists.
  Cmd.new(command).run(output)
  "#{output}.clstr"
end
|
38
|
+
|
39
|
+
# Runs vsearch clustering on +fasta+ and returns the path of the cluster
# table, which is +fasta+ with ".clust" appended.
def vsearch(fasta)
  puts "running vsearch" if @verbose
  "#{fasta}.clust".tap do |uc_file|
    # Cmd#run skips the command when the cluster table already exists.
    Cmd.new(generate_vsearch_command(fasta, uc_file)).run(uc_file)
  end
end
|
47
|
+
|
48
|
+
# Builds the cd-hit-est command line for clustering +fasta+ into +out+.
# Example of the intended shape:
#   cd-hit-est -i all.fa -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
# Returns the command as a single String.
def generate_cdhit_command(fasta, out)
  [
    @cdhit,
    "-i #{fasta}",
    "-o #{out}",
    "-c #{@id}",       # similarity = number of identical bases /
                       # length of shorter sequences
    "-T #{@threads}",
    "-n 10",           # word length - maybe increase??
    "-d 100",          # output name width
    "-g 1",            # slower but more accurate mode
    "-M 8000"          # increase memory
  ].join(" ")
end
|
61
|
+
|
62
|
+
# Builds the vsearch command line that clusters +fasta+ and writes the
# cluster table to +out+. Returns the command as a single String.
def generate_vsearch_command(fasta, out)
  flags = [
    "--cluster_fast #{fasta}",
    "--id #{@id}",
    "--iddef 0",       # cd-hit definition of sequence id
    "--qmask none",    # no masking
    "--strand both",
    "--uc #{out}",
    "--threads #{@threads}"
  ]
  flags.inject("#{@vsearch}") { |command, flag| command << " " << flag }
end
|
73
|
+
|
74
|
+
# Parses a cd-hit cluster listing into a Hash.
#
# cluster_output - path of the cluster file to read
#
# Returns { cluster_id (Integer) => [{ :name => String, :strand => "+"/"-" }] }.
# A member line marked with "*" is recorded with strand "+"; other member
# lines take the strand from the "at +/NN%" or "at -/NN%" suffix.
def parse_output(cluster_output)
  puts "parsing cd-hit output #{cluster_output}" if @verbose
  cluster_id = 0
  clusters = {}
  # File.foreach closes the file when iteration ends; the previous
  # File.open(...).each_line left the handle open.
  File.foreach(cluster_output) do |line|
    if line =~ />Cluster\ ([0-9]+)/
      cluster_id = $1.to_i
    elsif line =~ /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
      (clusters[cluster_id] ||= []) << { :name => $1, :strand => $2 }
    elsif line =~ /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
      (clusters[cluster_id] ||= []) << { :name => $1, :strand => "+" }
    end
  end
  clusters
end
|
96
|
+
|
97
|
+
# Parses the tab-separated cluster table written by vsearch (--uc).
#
# cluster_output - path of the table to read
#
# Only lines starting with "S" or "H" are used. Column 2 is taken as the
# cluster number, column 5 as the match orientation and column 9 as the
# sequence label.
#
# Returns { cluster (Integer) => [{ :name => String, :strand => "+"/"-" }] },
# the same member shape that parse_output produces and that the alignment
# step reads via member[:name] / member[:strand]. Previously bare name
# strings were returned, which that consumer cannot index.
# Any orientation other than "-" (including the "*" placeholder) is
# recorded as "+".
def parse_vsearch_output(cluster_output)
  clusters = {}
  # File.foreach closes the file once the block has seen every line.
  File.foreach(cluster_output) do |line|
    next unless line.start_with?("S", "H")

    cols = line.chomp.split("\t")
    strand = cols[4] == "-" ? "-" : "+"
    (clusters[cols[1].to_i] ||= []) << { :name => cols[8], :strand => strand }
  end
  clusters
end
|
110
|
+
|
111
|
+
end
|
112
|
+
|
113
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'open3'
|
2
|
+
|
3
|
+
module Transfuse
|
4
|
+
|
5
|
+
# Stand-in status object used when a command is skipped because its output
# file already exists; it always reports success.
class Status
  def success?
    true
  end
end
|
10
|
+
|
11
|
+
# Wraps a shell command string and captures its stdout, stderr and status.
class Cmd

  attr_accessor :cmd, :stdout, :stderr, :status

  # cmd - the command line to execute, as a String
  def initialize(cmd)
    @cmd = cmd
  end

  # Executes the command unless +file+ is given and already exists.
  #
  # Returns true when execution was skipped (stdout and stderr are set to
  # empty strings and status to a Status instance), false when the command
  # was actually run through Open3.capture3.
  def run(file = nil)
    if !file.nil? && File.exist?(file)
      @stdout = ""
      @stderr = ""
      @status = Status.new
      return true
    end

    @stdout, @stderr, @status = Open3.capture3(@cmd)
    false
  end

  # The command line itself.
  def to_s
    @cmd
  end

end
|
37
|
+
|
38
|
+
end
|
@@ -0,0 +1,201 @@
|
|
1
|
+
class String
  # Returns the reverse complement of a nucleotide sequence.
  #
  # Upper- and lower-case bases are complemented with their case kept, and
  # the IUPAC ambiguity codes R/Y, K/M, B/V and D/H are swapped. Every other
  # character (N, S, W, gaps, ...) is left unchanged and only reversed.
  # Previously only upper-case A, C, G and T were complemented, so
  # lower-case bases came back reversed but not complemented.
  def revcomp
    tr("ACGTRYKMBVDHacgtrykmbvdh", "TGCAYRMKVBHDtgcayrmkvbhd").reverse
  end
end
|
6
|
+
|
7
|
+
module Transfuse
|
8
|
+
|
9
|
+
require 'csv'
|
10
|
+
require 'transrate'
|
11
|
+
|
12
|
+
class Transfuse
|
13
|
+
|
14
|
+
# Stores the run settings and looks up the clustalo executable.
#
# threads - thread count handed on to clustering and transrate
# verbose - when truthy, progress messages are printed
#
# Raises RuntimeError if Which cannot find clustalo.
def initialize(threads, verbose)
  @threads, @verbose = threads, verbose
  @clustalo = Which::which('clustalo').first ||
              raise("clustalo was not in the PATH - please install it")
end
|
20
|
+
|
21
|
+
# Splits a comma-separated list of paths and verifies that each one exists.
#
# string - comma-separated file names; whitespace around each name and
#          empty entries (e.g. from a trailing comma) are ignored, so
#          "a.fa, b.fa" works the same as "a.fa,b.fa"
#
# Returns an Array of absolute paths in the order given.
# Aborts the program with "<path> not found" for the first missing file.
def check_files(string)
  string.split(",").map(&:strip).reject(&:empty?).map do |entry|
    path = File.expand_path(entry)
    abort "#{path} not found" unless File.exist?(path)
    puts "#{path} exists" if @verbose
    path
  end
end
|
34
|
+
|
35
|
+
# Concatenates the given assembly files into one FASTA file in the current
# directory.
#
# assemblies - Array of paths to concatenate, in order
#
# The output is named "all-<a>-<b>-....fa", where each part is the first six
# characters of an input's basename without extension. An existing output
# file is reused untouched, as before.
#
# The copy is done in Ruby rather than through a `cat ... > out` shell
# command, so paths containing spaces or shell metacharacters work and an
# empty list no longer leaves `cat` waiting on stdin. The data is written to
# a ".part" file and renamed at the end, so a failed copy cannot leave a
# truncated file that later runs would reuse.
#
# Returns the absolute path of the concatenated file.
def concatenate(assemblies)
  stems = assemblies.map { |name| File.basename(name, File.extname(name))[0..5] }
  catted_fasta = "all-#{stems.join('-')}.fa"
  puts "concatenating assemblies into #{catted_fasta}" if @verbose
  unless File.exist?(catted_fasta)
    partial = "#{catted_fasta}.part"
    File.open(partial, "wb") do |out|
      assemblies.each { |file| IO.copy_stream(file, out) }
    end
    File.rename(partial, catted_fasta)
  end
  File.expand_path(catted_fasta)
end
|
53
|
+
|
54
|
+
# Clusters the sequences in +file+ with a Cluster built from this object's
# thread and verbosity settings, and returns whatever Cluster#run returns.
def cluster(file)
  puts "clustering #{file}" if @verbose
  Cluster.new(@threads, @verbose).run(file)
end
|
59
|
+
|
60
|
+
# Reads every entry of the FASTA file +fasta+ into @sequences, a Hash of
# entry id => sequence String. Any previously loaded sequences are
# discarded first.
def load_fasta(fasta)
  @sequences = {}
  Bio::FastaFormat.open(fasta).each do |record|
    @sequences.store(record.entry_id, record.seq.to_s)
  end
end
|
66
|
+
|
67
|
+
# Aligns every cluster that has more than five members with clustalo and
# writes the alignment to "cluster<id>.fa" in the current directory.
#
# clusters - Hash of cluster id => Array of { :name => ..., :strand => ... }
#            members; "-" strand members are reverse-complemented before
#            alignment. load_fasta must have populated @sequences.
#
# The FASTA input is streamed to clustalo over stdin with an argument array,
# without a shell. The earlier `echo -e "<fasta>" | clustalo` form depended
# on the shell's echo understanding -e, was limited by the maximum command
# length and broke on shell metacharacters in contig names.
#
# Aborts with "Unknown strand ..." for a strand other than "+" or "-".
def sequence_alignment(clusters)
  clusters.each do |id, list| # threach
    next unless list.size > 5

    fasta = ""
    list.each do |member|
      sequence = @sequences[member[:name]]
      oriented =
        case member[:strand]
        when "+" then sequence
        when "-" then sequence.revcomp
        else abort "Unknown strand #{member[:strand]}"
        end
      fasta << ">#{member[:name]}\n#{oriented}\n"
    end

    command = [@clustalo, "-i", "-", "--outfmt", "fa",
               "--output-order", "tree-order"]
    aligned = IO.popen(command, "r+") do |io|
      # Feed stdin from a separate thread so reading the output can never
      # block against a full input pipe.
      writer = Thread.new do
        io.write fasta
        io.close_write
      end
      output = io.read
      writer.join
      output
    end

    File.open("cluster#{id}.fa", "wb") do |out|
      out.write aligned
    end
  end
end
|
91
|
+
|
92
|
+
# Reads contig scores from one or more CSV files that have a header row.
#
# files - Array of CSV paths; each file must provide the columns
#         "contig_name" and "score" (headers are converted to symbols and
#         field values are passed through CSV's :all converters)
#
# Returns a Hash of contig name => score. When a name occurs more than once
# the last value read wins.
def load_scores(files)
  csv_options = { :headers => true,
                  :header_converters => :symbol,
                  :converters => :all }
  files.each_with_object({}) do |path, table|
    CSV.foreach(path, **csv_options) do |record|
      table[record[:contig_name]] = record[:score]
    end
  end
end
|
105
|
+
|
106
|
+
# Writes a filtered copy of each assembly that keeps only contigs scoring
# above 0.01, renaming every contig to "contig<index>_<original id>", where
# index is the position of the assembly in +files+.
#
# files  - Array of FASTA paths
# scores - Hash of renamed contig name => numeric score
#
# Output goes to "<basename>_filtered.fa" in the current directory; an
# existing output file is reused untouched. The file is written under a
# ".part" name and renamed only once complete, so aborting half-way (a
# contig missing from +scores+) can no longer leave a truncated file that
# the next run silently reuses. The "opening" progress message now follows
# @verbose like the other progress messages in this class.
#
# Returns an Array of absolute paths of the filtered files.
# Aborts with "Can't find '<name>' in scores" for an unscored contig.
def filter(files, scores)
  files.each_with_index.map do |file, index|
    new_filename = "#{File.basename(file, File.extname(file))}_filtered.fa"
    unless File.exist?(new_filename)
      partial = "#{new_filename}.part"
      File.open(partial, "wb") do |out|
        puts "opening #{file}..." if @verbose
        Bio::FastaFormat.open(file).each do |entry|
          contig_name = "contig#{index}_#{entry.entry_id}"
          unless scores.key?(contig_name)
            abort "Can't find '#{contig_name}' in scores"
          end
          next unless scores[contig_name] > 0.01

          out.write ">#{contig_name}\n"
          out.write "#{entry.seq}\n"
        end
      end
      File.rename(partial, new_filename)
    end
    File.expand_path(new_filename)
  end
end
|
129
|
+
|
130
|
+
|
131
|
+
# Obtains a score for every contig, either from a cached "scores.csv" in the
# current directory or by running transrate on each assembly.
#
# files - Array of assembly FASTA paths
# left  - Array of left read files (joined with "," for transrate)
# right - Array of right read files (joined with "," for transrate)
#
# When "scores.csv" exists it is read as tab-separated "name<TAB>score"
# lines and transrate is not run. Otherwise each contig is scored and stored
# under "contig<index>_<name>", and the table is written to "scores.csv".
#
# The cache is read with File.foreach so the handle is closed afterwards,
# and blank lines are skipped instead of producing a nil key.
#
# Returns a Hash of contig name => score.
def transrate(files, left, right)
  scores = {}
  scores_file = "scores.csv"
  if File.exist?(scores_file)
    puts "loading scores from file" if @verbose
    File.foreach(scores_file) do |line|
      next if line.strip.empty?

      name, score = line.chomp.split("\t")
      scores[name] = score.to_f
    end
  else
    files.each_with_index do |fasta, index|
      puts "transrate on #{fasta}" if @verbose
      assembly = Transrate::Assembly.new(fasta)
      transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
      transrater.read_metrics(left.join(','), right.join(','))
      assembly.each do |name, contig|
        scores["contig#{index}_#{name}"] = contig.score
      end
    end
    File.open(scores_file, "wb") do |out|
      scores.each do |name, score|
        out.write "#{name}\t#{score}\n"
      end
    end
  end
  scores
end
|
159
|
+
|
160
|
+
# Picks the highest-scoring contig of every cluster.
#
# clusters - Hash of cluster id => Array of members; a member may be a
#            contig name or a Hash with a :name key (the shape produced by
#            the cd-hit parser)
# scores   - Hash of contig name => numeric score
#
# The first member with the highest score wins. The running best starts
# unset rather than at 0, so a cluster whose scores are all 0 or negative
# still yields its best member instead of an empty name. A cluster with no
# members contributes nothing.
#
# Returns an Array of contig names, one per non-empty cluster.
# Aborts when a member has no entry in +scores+.
def select_contigs(clusters, scores)
  puts "selecting contigs" if @verbose
  best = []
  clusters.each do |_cluster_id, list|
    best_score = nil
    best_contig = nil
    list.each do |member|
      contig_name = member.is_a?(Hash) ? member[:name] : member
      unless scores[contig_name]
        abort "can't find #{contig_name} in scores hash\n"
      end
      if best_score.nil? || scores[contig_name] > best_score
        best_score = scores[contig_name]
        best_contig = contig_name
      end
    end
    best << best_contig unless best_contig.nil?
  end
  best
end
|
179
|
+
|
180
|
+
# Writes the selected contigs to +output+ in FASTA format.
#
# best   - Array of contig names to write, in output order
# fasta  - path of the FASTA file the sequences are read from
# output - path of the file to write
#
# A name that is not present in +fasta+ is reported on stdout and skipped.
def output_contigs(best, fasta, output)
  puts "writing contigs" if @verbose
  # index every entry of the concatenated fasta by its id
  sequences = {}
  Bio::FastaFormat.open(fasta).each { |record| sequences[record.entry_id] = record.seq }
  File.open(output, "wb") do |out|
    best.each do |contig_name|
      unless sequences.key?(contig_name)
        puts "can't find #{contig_name} in #{fasta}"
        next
      end
      out.write ">#{contig_name}\n"
      out.write "#{sequences[contig_name]}\n"
    end
  end
end
|
198
|
+
|
199
|
+
end
|
200
|
+
|
201
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Transfuse

  # Version of this codebase, shown in help messages and used when the gem
  # is generated. Increment the parts according to
  # Semantic Versioning 2.0 (http://semver.org/).
  module VERSION
    MAJOR = 0
    MINOR = 1
    PATCH = 1
    BUILD = nil

    # Dotted version string; parts that are nil are left out.
    STRING = [MAJOR, MINOR, PATCH, BUILD].reject(&:nil?).map(&:to_s).join('.')
  end

end # Transfuse
|
data/lib/transfuse.rb
ADDED
data/notes.md
ADDED