big_simon 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +13 -1
- data/Gemfile.lock +7 -3
- data/Makefile +8 -0
- data/README.md +8 -2
- data/big_simon.gemspec +3 -1
- data/exe/big_simon +244 -0
- data/exe/ranks +93 -0
- data/lib/big_simon/parsers.rb +78 -0
- data/lib/big_simon/pipeline.rb +64 -0
- data/lib/big_simon/runners.rb +189 -0
- data/lib/big_simon/utils.rb +108 -0
- data/lib/big_simon/version.rb +13 -1
- data/lib/big_simon.rb +14 -87
- data/vendor/bin/mac/mummer +0 -0
- data/vendor/bin/mac/vhm.py +10 -9
- metadata +55 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dfbea3a58014cbec45a3959ad076fd842c6b393c
|
4
|
+
data.tar.gz: 69d9f8ac1dea196f64dd13fb06a13a4c26b38e45
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3a54fe903bb5c0f2f574a389dfc945ffe3d93bd2b1a15361bda81430542c2ef3990ac31e67b671690e056e23544e4f30f21dbecd756050dfb98dfb924fd15ca3
|
7
|
+
data.tar.gz: b61f6ba10b7efc267419ccccaef82654662923629e0b2b61e5a3c5e74b79fe8b753dd160c3e3d7fd57b1e926b54684f9367a839075544c496ff3a839ece57fb5
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,14 +1,17 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
big_simon (0.0
|
5
|
-
|
4
|
+
big_simon (0.1.0)
|
5
|
+
parse_fasta (~> 2.5, >= 2.5.2)
|
6
|
+
rya (~> 0.4.0)
|
7
|
+
trollop (~> 2.1, >= 2.1.3)
|
6
8
|
|
7
9
|
GEM
|
8
10
|
remote: https://rubygems.org/
|
9
11
|
specs:
|
10
12
|
abort_if (0.2.0)
|
11
13
|
diff-lcs (1.3)
|
14
|
+
parse_fasta (2.5.2)
|
12
15
|
rake (10.5.0)
|
13
16
|
rspec (3.7.0)
|
14
17
|
rspec-core (~> 3.7.0)
|
@@ -23,10 +26,11 @@ GEM
|
|
23
26
|
diff-lcs (>= 1.2.0, < 2.0)
|
24
27
|
rspec-support (~> 3.7.0)
|
25
28
|
rspec-support (3.7.1)
|
26
|
-
rya (0.
|
29
|
+
rya (0.4.0)
|
27
30
|
abort_if (~> 0.2.0)
|
28
31
|
systemu (~> 2.6, >= 2.6.5)
|
29
32
|
systemu (2.6.5)
|
33
|
+
trollop (2.1.3)
|
30
34
|
|
31
35
|
PLATFORMS
|
32
36
|
ruby
|
data/Makefile
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
test_small:
|
2
|
+
rm -r 0000TEST/; exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
3
|
+
|
4
|
+
test_small_install:
|
5
|
+
rm -r 0000TEST/; rake install && exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
6
|
+
|
7
|
+
test_toy:
|
8
|
+
rm -r toyexample_out; time exe/big_simon -v vendor/repos/VirHostMatcher/test/toyexample/virus/* -h vendor/repos/VirHostMatcher/test/toyexample/host/* -o toyexample_out -t 3
|
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# BigSimon
|
2
2
|
|
3
|
-
|
3
|
+
Hi, I'm BigSimon (but you can call me BigSi if you want), and I'm a pipeline for finding hosts of viruses!
|
4
|
+
|
5
|
+
Mainly, I'm just a wrapper for some other nice tools.
|
4
6
|
|
5
7
|
## Installation
|
6
8
|
|
@@ -20,7 +22,11 @@ Or install it yourself as:
|
|
20
22
|
|
21
23
|
## Usage
|
22
24
|
|
23
|
-
|
25
|
+
For now, there is not much documentation. To see the help file, run:
|
26
|
+
|
27
|
+
```
|
28
|
+
$ big_simon --help
|
29
|
+
```
|
24
30
|
|
25
31
|
## Development
|
26
32
|
|
data/big_simon.gemspec
CHANGED
@@ -26,5 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_development_dependency "rake", "~> 10.0"
|
27
27
|
spec.add_development_dependency "rspec", "~> 3.0"
|
28
28
|
|
29
|
-
spec.add_runtime_dependency "
|
29
|
+
spec.add_runtime_dependency "parse_fasta", "~> 2.5", ">= 2.5.2"
|
30
|
+
spec.add_runtime_dependency "rya", "~> 0.4.0"
|
31
|
+
spec.add_runtime_dependency "trollop", "~> 2.1", ">= 2.1.3"
|
30
32
|
end
|
data/exe/big_simon
ADDED
@@ -0,0 +1,244 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
Signal.trap("PIPE", "EXIT")
|
3
|
+
|
4
|
+
require "pp"
|
5
|
+
require "tempfile"
|
6
|
+
|
7
|
+
require "parse_fasta"
|
8
|
+
require "trollop"
|
9
|
+
|
10
|
+
require "big_simon"
|
11
|
+
|
12
|
+
# TODO make scaled scores with high score being better.
|
13
|
+
|
14
|
+
Process.extend Rya::CoreExtensions::Process
|
15
|
+
|
16
|
+
opts = Trollop.options do
|
17
|
+
version BigSimon::VERSION_BANNER
|
18
|
+
|
19
|
+
banner <<-EOS
|
20
|
+
|
21
|
+
#{BigSimon::VERSION_BANNER}
|
22
|
+
|
23
|
+
Hi, I'm BigSimon! I'm here to help you figure out the hosts for
|
24
|
+
your viruses.
|
25
|
+
|
26
|
+
I run a bunch of different programs. In addition to doing some
|
27
|
+
merging of results, I'll give you heatmaps for all the programs and
|
28
|
+
you can check for yourself.
|
29
|
+
|
30
|
+
The scaled scores run from 0 to 1 with lower scores being better.
|
31
|
+
|
32
|
+
Options:
|
33
|
+
EOS
|
34
|
+
|
35
|
+
opt :viruses, "Path to fasta file(s) with viruses", type: :strings
|
36
|
+
opt :hosts, "Path to fasta file(s) with hosts", type: :strings
|
37
|
+
opt :outdir, "Output directory", default: "big_simon"
|
38
|
+
opt :threads, "Number of threads to use", default: 1
|
39
|
+
end
|
40
|
+
|
41
|
+
Rya::AbortIf.logger.debug { "Command line opts: #{opts.inspect}" }
|
42
|
+
|
43
|
+
BigSimon::Utils.check_opt! opts, :viruses
|
44
|
+
BigSimon::Utils.check_opt! opts, :hosts
|
45
|
+
|
46
|
+
# Check infiles
|
47
|
+
[opts[:viruses], opts[:hosts]].flatten.each do |fname|
|
48
|
+
BigSimon::Utils.check_file! fname
|
49
|
+
end
|
50
|
+
|
51
|
+
Rya::AbortIf.abort_unless opts[:threads] > 0,
|
52
|
+
"--threads must be > 0"
|
53
|
+
|
54
|
+
programs = [
|
55
|
+
"WIsH",
|
56
|
+
"VirHostMatcher",
|
57
|
+
"mummer"
|
58
|
+
]
|
59
|
+
|
60
|
+
outdir = opts[:outdir]
|
61
|
+
threads = opts[:threads]
|
62
|
+
virus_fnames = opts[:viruses]
|
63
|
+
host_fnames = opts[:hosts]
|
64
|
+
|
65
|
+
FileUtils.mkdir_p outdir
|
66
|
+
|
67
|
+
tmpdir = File.join opts[:outdir], "big_simon_tmp"
|
68
|
+
tmpdir_virus = File.join tmpdir, "virus"
|
69
|
+
tmpdir_host = File.join tmpdir, "host"
|
70
|
+
|
71
|
+
# all_predictions_fname = File.join outdir, "scores_all.txt"
|
72
|
+
mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt"
|
73
|
+
|
74
|
+
virus_recs, host_recs = [], []
|
75
|
+
|
76
|
+
# Tempfile.open do |vir_f|
|
77
|
+
# Tempfile.open do |host_f|
|
78
|
+
# virus_fnames.each do |fname|
|
79
|
+
# ParseFasta::SeqFile.open(fname).each_record do |rec|
|
80
|
+
# vir_f.puts rec
|
81
|
+
#
|
82
|
+
# vir_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
|
83
|
+
# end
|
84
|
+
# end
|
85
|
+
#
|
86
|
+
# host_fnames.each do |fname|
|
87
|
+
# ParseFasta::SeqFile.open(fname).each_record do |rec|
|
88
|
+
# host_f.puts rec
|
89
|
+
# host_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
|
90
|
+
# end
|
91
|
+
# end
|
92
|
+
#
|
93
|
+
# vir_f.fsync
|
94
|
+
# host_f.fsync
|
95
|
+
#
|
96
|
+
# cmd = "mummer -maxmatch -l 15 #{host_f.path} #{vir_f.path} > /Users/moorer/Desktop/mummer.OUT"
|
97
|
+
# Process.run_and_time_it! "MUMMER", cmd
|
98
|
+
# end
|
99
|
+
# end
|
100
|
+
#
|
101
|
+
# header = nil
|
102
|
+
# hits = []
|
103
|
+
# hit_info = {}
|
104
|
+
# virus = nil
|
105
|
+
#
|
106
|
+
# File.open("/Users/moorer/Desktop/mummer.OUT", "rt").each_line.with_index do |line, idx|
|
107
|
+
# if line.start_with? '>'
|
108
|
+
# virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
|
109
|
+
#
|
110
|
+
# unless hit_info.has_key? virus
|
111
|
+
# hit_info[virus] = {}
|
112
|
+
# end
|
113
|
+
# else
|
114
|
+
# host, _, _, len = line.chomp.strip.split(" ")
|
115
|
+
# host = host.sub(/___reverse$/, "").strip
|
116
|
+
#
|
117
|
+
# unless hit_info[virus].has_key? host
|
118
|
+
# hit_info[virus][host] = -1
|
119
|
+
# end
|
120
|
+
#
|
121
|
+
# hit_info[virus][host] = len.to_i if len.to_i > hit_info[virus][host]
|
122
|
+
# end
|
123
|
+
# end
|
124
|
+
#
|
125
|
+
# puts
|
126
|
+
#
|
127
|
+
# hh = hit_info.map do |virus, info|
|
128
|
+
# [virus, info.to_a.sort_by {|gen, len| len}.reverse]
|
129
|
+
# end
|
130
|
+
#
|
131
|
+
# pp hh
|
132
|
+
|
133
|
+
# hh = hit_info.map do |virus, info|
|
134
|
+
# [virus, info.to_a.sort_by { |host, hit_len| hit_len }.reverse
|
135
|
+
#
|
136
|
+
# end
|
137
|
+
# p hit_info
|
138
|
+
|
139
|
+
scores_files = {}
|
140
|
+
programs.each do |program|
|
141
|
+
raw_fname = File.join outdir, "scores_raw.#{program}.txt"
|
142
|
+
scaled_fname = File.join outdir, "scores_scaled.#{program}.txt"
|
143
|
+
|
144
|
+
scores_files[program] = {
|
145
|
+
raw: File.open(raw_fname, "w"),
|
146
|
+
scaled: File.open(scaled_fname, "w")
|
147
|
+
}
|
148
|
+
end
|
149
|
+
|
150
|
+
scores_files.each do |program, files|
|
151
|
+
files.each do |name, file|
|
152
|
+
file.puts %w[virus host score].join "\t"
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
name_map_virus, all_ids_virus = BigSimon::Utils.set_up_tmp_dirs virus_fnames, tmpdir_virus, "virus"
|
157
|
+
name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmpdir_host, "host"
|
158
|
+
|
159
|
+
wish_outf = BigSimon::Runners.wish BigSimon::WISH, tmpdir_virus, tmpdir_host, tmpdir, threads
|
160
|
+
vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpdir_host, tmpdir
|
161
|
+
|
162
|
+
host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads
|
163
|
+
|
164
|
+
# Map them back to simple names. TODO just have it spit these out from the beginning.
|
165
|
+
host_info_mummer_simple_names = {}
|
166
|
+
inverted_name_map_virus = name_map_virus.invert
|
167
|
+
inverted_name_map_host = name_map_host.invert
|
168
|
+
|
169
|
+
host_info_mummer.each do |virus, host_tables|
|
170
|
+
virname = virus
|
171
|
+
if inverted_name_map_virus.has_key? virus
|
172
|
+
virname = inverted_name_map_virus[virus]
|
173
|
+
end
|
174
|
+
|
175
|
+
host_info_mummer_simple_names[virname] = []
|
176
|
+
|
177
|
+
host_tables.map do |table|
|
178
|
+
hostname = inverted_name_map_host.has_key?(table[:host]) ? inverted_name_map_host[table[:host]] : table[:host]
|
179
|
+
new_table = { host: hostname, score: table[:score], scaled_score: table[:scaled_score] }
|
180
|
+
|
181
|
+
host_info_mummer_simple_names[virname] << new_table
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
host_info_wish = BigSimon::Parsers.wish wish_outf
|
186
|
+
host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf
|
187
|
+
|
188
|
+
host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer_simple_names], programs
|
189
|
+
host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host
|
190
|
+
|
191
|
+
# puts
|
192
|
+
# pp host_info
|
193
|
+
# puts
|
194
|
+
|
195
|
+
# Just a basic all info file
|
196
|
+
# File.open all_predictions_fname, "w" do |f|
|
197
|
+
# f.puts %w[virus host program score scaled.score].join "\t"
|
198
|
+
|
199
|
+
host_info.each do |virus, h1|
|
200
|
+
h1.each do |host, h2|
|
201
|
+
lines = {}
|
202
|
+
|
203
|
+
h2[:scores].each do |program, score|
|
204
|
+
lines[[virus, host, program]] = [score]
|
205
|
+
|
206
|
+
scores_files[program][:raw].puts [virus, host, score].join "\t"
|
207
|
+
end
|
208
|
+
|
209
|
+
# Add in the scaled score too.
|
210
|
+
h2[:scaled_scores].each do |program, score|
|
211
|
+
lines[[virus, host, program]] << score
|
212
|
+
|
213
|
+
scores_files[program][:scaled].puts [virus, host, score].join "\t"
|
214
|
+
end
|
215
|
+
|
216
|
+
# lines.each do |(virus, host, program), (score, scaled_score)|
|
217
|
+
# f.puts [virus, host, program, score, scaled_score].join "\t"
|
218
|
+
# end
|
219
|
+
end
|
220
|
+
end
|
221
|
+
# end
|
222
|
+
|
223
|
+
# A file with mean scaled scores.
|
224
|
+
File.open mean_scaled_scores_fname, "w" do |f|
|
225
|
+
f.puts %w[virus host score].join "\t"
|
226
|
+
|
227
|
+
host_info.each do |virus, h1|
|
228
|
+
h1.each do |host, h2|
|
229
|
+
scaled_scores = h2[:scaled_scores].values
|
230
|
+
|
231
|
+
mean_scaled_score = scaled_scores.reduce(:+) / scaled_scores.length.to_f
|
232
|
+
|
233
|
+
f.puts [virus, host, mean_scaled_score].join "\t"
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
237
|
+
|
238
|
+
scores_files.each do |program, file|
|
239
|
+
file.values.map(&:close)
|
240
|
+
end
|
241
|
+
FileUtils.rm_r tmpdir
|
242
|
+
|
243
|
+
# Make the heatmaps
|
244
|
+
BigSimon::Runners.heatmaps BigSimon::RSCRIPT, outdir, File.join(outdir, "heatmaps")
|
data/exe/ranks
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
Signal.trap("PIPE", "EXIT")
|
3
|
+
|
4
|
+
require "pp"
|
5
|
+
|
6
|
+
by_program = {}
|
7
|
+
lines = []
|
8
|
+
|
9
|
+
TOP = (ENV["TOP"] || 5).to_i
|
10
|
+
|
11
|
+
ARGV.each do |fname|
|
12
|
+
scores = {}
|
13
|
+
File.open(fname, "rt").each_line.with_index do |line, idx|
|
14
|
+
unless idx.zero?
|
15
|
+
virus, host, score = line.chomp.split "\t"
|
16
|
+
|
17
|
+
unless scores.has_key? virus
|
18
|
+
scores[virus] = []
|
19
|
+
end
|
20
|
+
|
21
|
+
scores[virus] << [host, score.to_f]
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
scores.sort_by { |virus, _| virus }.each do |virus, host_scores|
|
26
|
+
# Lowest score is the best
|
27
|
+
top_5 = host_scores.sort_by { |host, score| score }.take(TOP).map(&:first)
|
28
|
+
|
29
|
+
line = [File.basename(fname), virus, top_5]
|
30
|
+
lines << line
|
31
|
+
|
32
|
+
# puts line.join "\t"
|
33
|
+
end
|
34
|
+
# puts
|
35
|
+
end
|
36
|
+
|
37
|
+
lines.each do |line|
|
38
|
+
program, virus, all = line
|
39
|
+
first = all.first
|
40
|
+
|
41
|
+
unless by_program.has_key? virus
|
42
|
+
by_program[virus] = {}
|
43
|
+
end
|
44
|
+
|
45
|
+
unless program == "scores_scaled.mean.txt"
|
46
|
+
by_program[virus][program] = { first: first, all: all }
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# These track the number of times a host shows up in the first spot and in the top N spots for that virus for all programs.
|
51
|
+
first_table = {}
|
52
|
+
top_host_table = {}
|
53
|
+
|
54
|
+
by_program.each do |virus, program_tables|
|
55
|
+
first_table[virus] = {}
|
56
|
+
top_host_table[virus] = {}
|
57
|
+
|
58
|
+
program_tables.each do |program, top_info|
|
59
|
+
|
60
|
+
|
61
|
+
first_host = top_info[:first]
|
62
|
+
all_top = top_info[:all]
|
63
|
+
|
64
|
+
unless first_table[virus].has_key? first_host
|
65
|
+
first_table[virus][first_host] = []
|
66
|
+
end
|
67
|
+
|
68
|
+
|
69
|
+
first_table[virus][first_host] << program
|
70
|
+
|
71
|
+
all_top.each do |top_host|
|
72
|
+
unless top_host_table[virus].has_key? top_host
|
73
|
+
top_host_table[virus][top_host] = []
|
74
|
+
end
|
75
|
+
|
76
|
+
top_host_table[virus][top_host] << program
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
first_table.each do |virus, host_counts|
|
82
|
+
host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
|
83
|
+
STDERR.puts [virus, :best, host, programs.count, programs].join "\t"
|
84
|
+
end
|
85
|
+
STDERR.puts
|
86
|
+
end
|
87
|
+
|
88
|
+
top_host_table.each do |virus, host_counts|
|
89
|
+
host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
|
90
|
+
puts [virus, :top_N, host, programs.count, programs].join "\t"
|
91
|
+
end
|
92
|
+
puts
|
93
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module BigSimon
|
2
|
+
# Methods for parsing output files
|
3
|
+
class Parsers
|
4
|
+
|
5
|
+
# @note VirHostMatcher returns true distances that run from 0 to 1, so it doesn't need scaling.
|
6
|
+
# @note VirHostMatcher includes the whole file name as the id of the organism, so we chop off some common endings.
|
7
|
+
# Parses the comma separated distance matrix written by VirHostMatcher.
#
# The first line is a header: its first cell is ignored and the remaining
# cells are host names.  Every other line is a virus name followed by one
# distance per host.  Virus and host names are passed through
# BigSimon::Utils.strip_suffix.  Blank lines are skipped.
#
# @param fname [String] path to the VirHostMatcher CSV output
# @return [Hash{String => Array<Hash>}] virus name => host tables of the form
#   { host:, score:, scaled_score: }, sorted by ascending scaled score
def self.vir_host_matcher fname
  hosts = []
  host_info = {}

  # Block form so the file handle is closed even if parsing raises.
  File.open(fname, "rt") do |f|
    f.each_line.with_index do |line, line_idx|
      # Get rid of the line ending and of any trailing comma.
      fields = line.chomp.sub(/,$/, "").split(",")

      # A blank (e.g. trailing) line carries no data.
      next if fields.empty?

      if line_idx.zero?
        # The first header cell only labels the matrix; the rest are hosts.
        hosts = fields.drop(1).map { |name| BigSimon::Utils.strip_suffix name }
      else
        virus = BigSimon::Utils.strip_suffix fields.shift

        # In this case the best value is the lowest distance.
        host_info[virus] = fields.each_with_index.map do |dist, host_idx|
          { host: hosts[host_idx], score: dist.to_f, scaled_score: dist.to_f }
        end.sort_by { |table| table[:scaled_score] }
      end
    end
  end

  host_info
end
|
36
|
+
|
37
|
+
# @note WIsH gives log likelihoods so the scaled value is actually scaled.
|
38
|
+
# @note The viruses and hosts will have the ID rather than the file name.
|
39
|
+
# Parses the tab separated log likelihood matrix written by WIsH.
#
# The first line lists the viruses and is padded with an empty leading cell so
# that it lines up with the other lines (presumably because the WIsH header
# has no cell for the host column).  Every other line is a host followed by
# one log likelihood per virus.  Blank lines are skipped.
#
# @param fname [String] path to the WIsH matrix
# @return [Hash{String => Array<Hash>}] virus => host tables of the form
#   { host:, score:, scaled_score: }, sorted by ascending scaled score, where
#   scaled_score is 1 - exp(score)
# @raise [IndexError] if the (padded) rows differ in their number of columns
def self.wish fname
  # Block form so the file handle is closed as soon as the rows are read.
  rows = File.open(fname, "rt") do |f|
    f.each_line.with_index.map do |line, line_idx|
      fields = line.chomp.split("\t")
      fields.unshift("") if line_idx.zero?
      fields
    end
  end

  # A blank (e.g. trailing) line would otherwise make transpose raise.
  rows = rows.reject(&:empty?)

  # After flipping the matrix the first row holds the hosts and each of the
  # other rows is a virus followed by its scores.
  host_row, *virus_rows = rows.transpose
  return {} if host_row.nil?

  hosts = host_row.drop(1)

  virus_rows.each_with_object({}) do |(virus, *log_likelihoods), host_info|
    host_tables = log_likelihoods.each_with_index.map do |ll, host_idx|
      score = ll.to_f

      { host: hosts[host_idx], score: score, scaled_score: 1 - Math.exp(score) }
    end

    # Sort each virus exactly once rather than on every pass of the loop.
    host_info[virus] = host_tables.sort_by { |table| table[:scaled_score] }
  end
end
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module BigSimon
|
2
|
+
class Pipeline
|
3
|
+
# @param collated_results_table { virus => host => score_type => program => score }
|
4
|
+
# Renames the viruses and hosts of a collated results table.
#
# @param virus_name_map [Hash] current virus name => new virus name
# @param host_name_map [Hash] current host name => new host name
# @return [Hash] a new table with the same nesting as the input.  Names that
#   are missing from a map are kept as they are.  The innermost score tables
#   are reused, not copied.
def self.map_taxa collated_results_table, virus_name_map, host_name_map
  collated_results_table.each_with_object({}) do |(virus_name, host_table), new_results_table|
    # fetch with a default falls back to the current name when it isn't mapped.
    new_virus_name = virus_name_map.fetch(virus_name, virus_name)

    new_results_table[new_virus_name] =
      host_table.each_with_object({}) do |(host_name, score_table), new_host_table|
        new_host_table[host_name_map.fetch(host_name, host_name)] = score_table
      end
  end
end
|
29
|
+
|
30
|
+
# @param [Array<Hash>] results_table host info hash tables. See functions in Parsers class.
|
31
|
+
# @param [Array<String>] programs names of programs generating hash tables (in same order as results_table)
|
32
|
+
def self.collate_host_results results_table, programs
|
33
|
+
Rya::AbortIf.assert results_table.count == programs.count
|
34
|
+
|
35
|
+
virus_host_scores = {}
|
36
|
+
all_viruses = results_table.reduce(Set.new) { |acc, ht| acc + ht.keys }
|
37
|
+
|
38
|
+
all_viruses.each do |virus|
|
39
|
+
virus_host_scores[virus] = {}
|
40
|
+
end
|
41
|
+
|
42
|
+
results_table.each_with_index do |ht, idx|
|
43
|
+
program = programs[idx]
|
44
|
+
|
45
|
+
ht.each do |virus, host_scores|
|
46
|
+
host_scores.each do |ht|
|
47
|
+
host = ht[:host]
|
48
|
+
score = ht[:score]
|
49
|
+
scaled_score = ht[:scaled_score]
|
50
|
+
|
51
|
+
unless virus_host_scores[virus].has_key? host
|
52
|
+
virus_host_scores[virus][host] = { scores: {}, scaled_scores: {}}
|
53
|
+
end
|
54
|
+
|
55
|
+
virus_host_scores[virus][host][:scores][program] = score
|
56
|
+
virus_host_scores[virus][host][:scaled_scores][program] = scaled_score
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
virus_host_scores
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
@@ -0,0 +1,189 @@
|
|
1
|
+
require "tempfile"
|
2
|
+
|
3
|
+
module BigSimon
|
4
|
+
class Runners
|
5
|
+
|
6
|
+
# This one's a bit different as it parses as well and returns original names.
|
7
|
+
# @todo Also do the reverse of each genome in case it's a contig.
|
8
|
+
def self.mummer exe, vir_dir, host_dir, outdir, threads
|
9
|
+
klass = Class.new.extend Rya::CoreExtensions::Math
|
10
|
+
FileUtils.mkdir_p outdir
|
11
|
+
|
12
|
+
# TODO put these all in one file then do it?
|
13
|
+
|
14
|
+
results = {}
|
15
|
+
|
16
|
+
# Takes names in files and puts them to the file names
|
17
|
+
name_map = {}
|
18
|
+
|
19
|
+
Dir.glob(vir_dir + "/*").each do |vir_fname|
|
20
|
+
this_virus_scores = []
|
21
|
+
virus = nil
|
22
|
+
|
23
|
+
Dir.glob(host_dir + "/*").each do |host_fname|
|
24
|
+
vir_base = File.basename vir_fname
|
25
|
+
host_base = File.basename host_fname
|
26
|
+
outfname = File.join outdir, "#{vir_base}___#{host_base}.mummer"
|
27
|
+
|
28
|
+
# -l is min length of a match TODO pull this into a const
|
29
|
+
# -F to force 4 columns
|
30
|
+
cmd = "#{exe} -F " \
|
31
|
+
"-maxmatch " \
|
32
|
+
"-l 15 " \
|
33
|
+
"#{host_fname} " \
|
34
|
+
"#{vir_fname} " \
|
35
|
+
"> #{outfname}"
|
36
|
+
|
37
|
+
Process.run_and_time_it! "Calculating matches", cmd
|
38
|
+
|
39
|
+
# Note there should only be one '>' per file here.
|
40
|
+
host = nil
|
41
|
+
score = 0
|
42
|
+
File.open(outfname, "rt").each_line.with_index do |line, idx|
|
43
|
+
if idx.zero?
|
44
|
+
this_virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
|
45
|
+
|
46
|
+
Rya::AbortIf::abort_unless(this_virus == virus, "OOPS") if virus
|
47
|
+
|
48
|
+
virus ||= this_virus
|
49
|
+
else
|
50
|
+
ary = line.chomp.strip.split(" ")
|
51
|
+
Rya::AbortIf.abort_unless ary.count == 4, "Problem parsing #{outfname} (mummer output)"
|
52
|
+
|
53
|
+
host = ary[0].sub(/___reverse$/, "").strip
|
54
|
+
len = ary[3].to_i
|
55
|
+
|
56
|
+
score = len if len > score
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
this_virus_scores << score
|
61
|
+
|
62
|
+
unless results.has_key? virus
|
63
|
+
results[virus] = []
|
64
|
+
end
|
65
|
+
|
66
|
+
results[virus] << { host: host, score: score, scaled_score: nil }
|
67
|
+
|
68
|
+
FileUtils.rm outfname
|
69
|
+
end
|
70
|
+
|
71
|
+
# This was the original scaling, i.e. per virus
|
72
|
+
# min = 0 # this_virus_scores.min # Technically, this should range from 0 to 15. Any data missing from this table would give a zero. TODO we don't actually account for this though.
|
73
|
+
# max = this_virus_scores.max
|
74
|
+
# from = 1
|
75
|
+
# to = 0
|
76
|
+
#
|
77
|
+
# results[virus].each do |host_table|
|
78
|
+
# host_table[:scaled_score] = klass.scale host_table[:score], min, max, from, to
|
79
|
+
# end
|
80
|
+
end
|
81
|
+
|
82
|
+
all_scores = []
|
83
|
+
results.each do |virus, host_tables|
|
84
|
+
all_scores << host_tables.map { |table| table[:score] }
|
85
|
+
end
|
86
|
+
|
87
|
+
all_scores.flatten!
|
88
|
+
max = all_scores.max
|
89
|
+
|
90
|
+
results.each do |virus, host_tables|
|
91
|
+
host_tables.each do |host_table|
|
92
|
+
host_table[:scaled_score] = klass.scale host_table[:score], 0, max, 1, 0
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
results
|
97
|
+
end
|
98
|
+
|
99
|
+
def self.vir_host_matcher exe, vir_dir, host_dir, outdir
|
100
|
+
FileUtils.mkdir_p outdir
|
101
|
+
|
102
|
+
cmd = "python #{exe} " \
|
103
|
+
"-v #{vir_dir} " \
|
104
|
+
"-b #{host_dir} " \
|
105
|
+
"-o #{outdir} " \
|
106
|
+
"-d 1" # only compute d2star dissimilarity
|
107
|
+
|
108
|
+
Process.run_and_time_it! "Computing d2star dissimilarity", cmd
|
109
|
+
|
110
|
+
tmp_dir = File.join outdir, "tmp"
|
111
|
+
FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir
|
112
|
+
|
113
|
+
bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
|
114
|
+
bad_files.each do |fname|
|
115
|
+
path = File.join outdir, fname
|
116
|
+
|
117
|
+
FileUtils.rm path if File.exist? path
|
118
|
+
end
|
119
|
+
|
120
|
+
outf = File.join outdir, "d2star_k6.csv"
|
121
|
+
new_outf = File.join outdir, "vir_host_matcher.txt"
|
122
|
+
FileUtils.mv outf, new_outf
|
123
|
+
|
124
|
+
new_outf
|
125
|
+
end
|
126
|
+
|
127
|
+
# Runs the WIsH program
|
128
|
+
#
|
129
|
+
# @raise [AbortIf::Exit] if commands fail
|
130
|
+
def self.wish exe, vir_dir, host_dir, outdir, threads
|
131
|
+
model_dir = File.join outdir, "model"
|
132
|
+
|
133
|
+
FileUtils.mkdir_p model_dir
|
134
|
+
|
135
|
+
build_model = "#{exe} " \
|
136
|
+
"-t #{threads} " \
|
137
|
+
"-c build " \
|
138
|
+
"-g #{host_dir} " \
|
139
|
+
"-m #{model_dir}"
|
140
|
+
|
141
|
+
predict = "#{exe} " \
|
142
|
+
"-t #{threads} " \
|
143
|
+
"-c predict " \
|
144
|
+
"-g #{vir_dir} " \
|
145
|
+
"-m #{model_dir} " \
|
146
|
+
"-r #{outdir}"
|
147
|
+
|
148
|
+
Process.run_and_time_it! "Building model", build_model
|
149
|
+
Process.run_and_time_it! "Predicting host", predict
|
150
|
+
|
151
|
+
FileUtils.rm_r model_dir if Dir.exist? model_dir
|
152
|
+
|
153
|
+
outf = File.join outdir, "llikelihood.matrix"
|
154
|
+
new_outf = File.join outdir, "wish.txt"
|
155
|
+
FileUtils.mv outf, new_outf
|
156
|
+
|
157
|
+
new_outf
|
158
|
+
end
|
159
|
+
|
160
|
+
def self.heatmaps exe, indir, outdir
|
161
|
+
FileUtils.mkdir_p outdir
|
162
|
+
|
163
|
+
fnames = Dir.glob("#{indir}/scores*.txt").map do |in_fname|
|
164
|
+
extname = File.extname in_fname
|
165
|
+
basename = File.basename in_fname, extname
|
166
|
+
|
167
|
+
out_fname = File.join outdir, "#{basename}.heatmap.pdf"
|
168
|
+
|
169
|
+
[in_fname, out_fname]
|
170
|
+
end
|
171
|
+
|
172
|
+
|
173
|
+
rcode_str = BigSimon::Utils.rcode fnames
|
174
|
+
|
175
|
+
Object::Tempfile.open do |f|
|
176
|
+
f.puts rcode_str
|
177
|
+
f.fsync # ensure no data is buffered
|
178
|
+
|
179
|
+
|
180
|
+
cmd = "#{exe} #{f.path}"
|
181
|
+
Process.run_and_time_it! "Drawing heatmaps", cmd
|
182
|
+
end
|
183
|
+
|
184
|
+
out_fnames = fnames.map(&:last)
|
185
|
+
end
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
|
@@ -0,0 +1,108 @@
|
|
1
|
+
module BigSimon
|
2
|
+
# @todo These don't have unit tests yet.
|
3
|
+
# @note Skips any duplicate IDs. Only keeps the first one.
|
4
|
+
class Utils
|
5
|
+
def self.check_file! fname
|
6
|
+
Rya::AbortIf.abort_if fname && !File.exist?(fname),
|
7
|
+
"#{fname} doesn't exist! Try big_simon --help for help."
|
8
|
+
end
|
9
|
+
|
10
|
+
# Aborts with a usage hint unless the option +arg+ was given on the command
# line.
#
# @param opts [Hash] parsed command line options (as returned by
#   Trollop.options)
# @param arg [Symbol] name of the required option, e.g. :viruses
def self.check_opt! opts, arg
  # Trollop only adds the :<name>_given key for options that were actually
  # passed.  Look it up with [] (nil when absent); fetch would raise KeyError
  # before the message below could be shown.
  Rya::AbortIf.abort_unless opts[:"#{arg}_given"],
                            "You must specify --#{arg.to_s.tr('_', '-')}. Try big_simon --help for help."
end
|
14
|
+
|
15
|
+
def self.rcode fnames
|
16
|
+
functions = %Q|
|
17
|
+
library(reshape2)
|
18
|
+
library(gplots)
|
19
|
+
library(RColorBrewer)
|
20
|
+
|
21
|
+
file.join <- function(...) {
|
22
|
+
paste(..., sep="/")
|
23
|
+
}
|
24
|
+
|
25
|
+
draw.heatmap <- function(infname, outfname) {
|
26
|
+
dat <- read.table(infname, header=T, sep="\t")
|
27
|
+
|
28
|
+
wide.dat <- dcast(dat, host ~ virus, value.var="score")
|
29
|
+
|
30
|
+
hosts <- wide.dat[, 1]
|
31
|
+
scores <- wide.dat[, 2:ncol(wide.dat)]
|
32
|
+
scores.numeric <- apply(scores, 2, as.numeric)
|
33
|
+
|
34
|
+
scores.matrix <- as.matrix(scores.numeric)
|
35
|
+
|
36
|
+
rownames(scores.matrix) <- hosts
|
37
|
+
|
38
|
+
palette <- "YlOrBr"
|
39
|
+
col <- colorRampPalette(brewer.pal(n=9, palette))(n = 25)
|
40
|
+
size <- 0.75
|
41
|
+
|
42
|
+
pdf(outfname, height=5, width=8)
|
43
|
+
|
44
|
+
heatmap.2(scores.matrix,
|
45
|
+
trace="none", ## Disable those wonky lines.
|
46
|
+
col=col, ## Set the color.
|
47
|
+
|
48
|
+
## Size opts
|
49
|
+
margins=c(11, 11), cexRow=size, cexCol=size,
|
50
|
+
|
51
|
+
## Key labeling
|
52
|
+
key.xlab="Score")
|
53
|
+
|
54
|
+
invisible(dev.off())
|
55
|
+
}
|
56
|
+
|
57
|
+
|
|
58
|
+
|
59
|
+
drawing = fnames.map do |in_fname, out_fname|
|
60
|
+
%Q{
|
61
|
+
|
62
|
+
draw.heatmap("#{in_fname}", "#{out_fname}")
|
63
|
+
}
|
64
|
+
end.join
|
65
|
+
|
66
|
+
[functions, drawing].join "\n"
|
67
|
+
end
|
68
|
+
|
69
|
+
def self.scale_log_likelihood ll
|
70
|
+
1 - Math.exp(ll)
|
71
|
+
end
|
72
|
+
|
73
|
+
def self.set_up_tmp_dirs fastas, tmpdir, which
|
74
|
+
Object::FileUtils.mkdir_p tmpdir
|
75
|
+
|
76
|
+
name_map = {}
|
77
|
+
all_ids = Set.new
|
78
|
+
|
79
|
+
seq_num = -1
|
80
|
+
fastas.each do |fname|
|
81
|
+
ParseFasta::SeqFile.open(fname).each_record do |rec|
|
82
|
+
if all_ids.include? rec.id
|
83
|
+
Rya::AbortIf.logger.warn { "#{rec.id} was seen more than one time! Duplicate organism IDs are not allowed, so we will only keep the first one." }
|
84
|
+
else
|
85
|
+
all_ids << rec.id
|
86
|
+
|
87
|
+
seq_num += 1
|
88
|
+
|
89
|
+
new_id = "#{which}_#{seq_num}"
|
90
|
+
name_map[new_id] = rec.id
|
91
|
+
|
92
|
+
outfname = File.join tmpdir, "#{new_id}.fa"
|
93
|
+
|
94
|
+
File.open(outfname, "w") do |f|
|
95
|
+
f.puts rec
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
[name_map, all_ids]
|
102
|
+
end
|
103
|
+
|
104
|
+
# Removes a trailing ".fasta" or ".fa" extension from a name.
#
# @param fname [String] file name (or any name that may end in an extension)
# @return [String] a new string without the extension; an equal string if the
#   name ends in neither extension
def self.strip_suffix fname
  # The dot is escaped and the match is anchored to the end of the string.
  # Unescaped, the dot matched any character, so a name such as "alfalfa"
  # lost its last three letters.
  fname.sub(/\.(?:fasta|fa)\z/, "")
end
|
107
|
+
end
|
108
|
+
end
|
data/lib/big_simon/version.rb
CHANGED
@@ -1,3 +1,15 @@
|
|
1
1
|
module BigSimon
|
2
|
-
VERSION = "0.0
|
2
|
+
VERSION = "0.1.0"
|
3
|
+
|
4
|
+
COPYRIGHT = "2018 Ryan Moore"
|
5
|
+
CONTACT = "moorer@udel.edu"
|
6
|
+
WEBSITE = "https://github.com/mooreryan/InteinFinder"
|
7
|
+
LICENSE = "GPLv3"
|
8
|
+
|
9
|
+
VERSION_BANNER =
|
10
|
+
" # Version: v#{VERSION}
|
11
|
+
# Copyright: #{COPYRIGHT}
|
12
|
+
# Contact: #{CONTACT}
|
13
|
+
# License: #{LICENSE}"
|
14
|
+
|
3
15
|
end
|
data/lib/big_simon.rb
CHANGED
@@ -1,100 +1,27 @@
|
|
1
1
|
require "rya"
|
2
|
+
require "set"
|
3
|
+
require "pp"
|
2
4
|
|
3
5
|
require "big_simon/version"
|
4
6
|
|
7
|
+
require "big_simon/utils"
|
8
|
+
|
9
|
+
require "big_simon/runners"
|
10
|
+
require "big_simon/parsers"
|
11
|
+
require "big_simon/pipeline"
|
12
|
+
|
5
13
|
Time.extend Rya::CoreExtensions::Time
|
6
14
|
Process.extend Rya::CoreExtensions::Process
|
15
|
+
Array.include Rya::CoreExtensions::Array
|
16
|
+
Math.extend Rya::CoreExtensions::Math
|
7
17
|
|
8
18
|
module BigSimon
|
9
|
-
|
10
|
-
# Project directories
|
11
19
|
ROOT = File.join __dir__, ".."
|
12
20
|
BIN = File.join ROOT, "vendor", "bin", "mac"
|
13
21
|
SPEC = File.join ROOT, "spec"
|
14
22
|
TEST_FILES = File.join SPEC, "test_files"
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
hosts = nil
|
20
|
-
|
21
|
-
host_info = {}
|
22
|
-
File.open(fname, "rt").each_line.with_index do |line, idx|
|
23
|
-
line.chomp!
|
24
|
-
line.sub! /,$/, "" # git rid of trailing commas
|
25
|
-
|
26
|
-
if idx.zero?
|
27
|
-
stat, *hosts = line.split ","
|
28
|
-
else
|
29
|
-
ary = line.split ","
|
30
|
-
virus = ary.shift
|
31
|
-
|
32
|
-
dists = ary.map.
|
33
|
-
with_index { |dist, idx| [hosts[idx], dist.to_f] }.
|
34
|
-
sort_by { |_, dist| dist }
|
35
|
-
|
36
|
-
best_host = dists[0][0]
|
37
|
-
|
38
|
-
host_info[virus] = {
|
39
|
-
best: best_host,
|
40
|
-
all: dists
|
41
|
-
}
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
host_info
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
class Runners
|
50
|
-
|
51
|
-
# Runs the WIsH program
|
52
|
-
#
|
53
|
-
# @raise [AbortIf::Exit] if commands fail
|
54
|
-
def self.wish exe, vir_dir, host_dir, outdir, threads
|
55
|
-
model_dir = File.join outdir, "model"
|
56
|
-
|
57
|
-
FileUtils.mkdir_p model_dir
|
58
|
-
|
59
|
-
build_model = "#{exe} " \
|
60
|
-
"-t #{threads} " \
|
61
|
-
"-c build " \
|
62
|
-
"-g #{host_dir} " \
|
63
|
-
"-m #{model_dir}"
|
64
|
-
|
65
|
-
predict = "#{exe} " \
|
66
|
-
"-t #{threads} " \
|
67
|
-
"-c predict " \
|
68
|
-
"-g #{vir_dir} " \
|
69
|
-
"-m #{model_dir} " \
|
70
|
-
"-r #{outdir} -b"
|
71
|
-
|
72
|
-
Process.run_and_time_it! "Building model", build_model
|
73
|
-
Process.run_and_time_it! "Predicting host", predict
|
74
|
-
|
75
|
-
FileUtils.rm_r model_dir if Dir.exist? model_dir
|
76
|
-
end
|
77
|
-
|
78
|
-
def self.vir_host_matcher exe, vir_dir, host_dir, outdir
|
79
|
-
FileUtils.mkdir_p outdir
|
80
|
-
|
81
|
-
cmd = "python #{exe} " \
|
82
|
-
"-v #{vir_dir} " \
|
83
|
-
"-b #{host_dir} " \
|
84
|
-
"-o #{outdir} " \
|
85
|
-
"-d 1" # only compute d2star dissimilarity
|
86
|
-
|
87
|
-
Process.run_and_time_it! "Computing d2star dissimilarity", cmd
|
88
|
-
|
89
|
-
tmp_dir = File.join outdir, "tmp"
|
90
|
-
FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir
|
91
|
-
|
92
|
-
bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
|
93
|
-
bad_files.each do |fname|
|
94
|
-
path = File.join outdir, fname
|
95
|
-
|
96
|
-
FileUtils.rm path if File.exist? path
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
23
|
+
WISH = File.join BIN, "WIsH"
|
24
|
+
VHM = File.join BIN, "vhm.py"
|
25
|
+
MUMMER = File.join BIN, "mummer"
|
26
|
+
RSCRIPT = "Rscript"
|
100
27
|
end
|
Binary file
|
data/vendor/bin/mac/vhm.py
CHANGED
@@ -46,8 +46,9 @@ if not os.path.exists(tmpDir) :
|
|
46
46
|
filelog = open(os.path.join(tmpDir, 'vhm.log'), 'w')
|
47
47
|
|
48
48
|
## name length ##
|
49
|
-
nameLen = 93 - len(options.outDir)
|
49
|
+
# nameLen = 93 - len(options.outDir)
|
50
50
|
#### possibly because of the kmercount folder name for each contig is too long?
|
51
|
+
nameLen = 99999999999999
|
51
52
|
|
52
53
|
|
53
54
|
#################### 0: preparation ############################
|
@@ -211,7 +212,7 @@ if options.hostTaxaFile is None :
|
|
211
212
|
hostTaxaFileWrite = open(hostTaxaFile, 'w') ## make file blank
|
212
213
|
hostTaxaFileWrite.close()
|
213
214
|
hostTaxaFileWrite = open(hostTaxaFile, 'a')
|
214
|
-
|
215
|
+
|
215
216
|
hostTaxaFileWrite.write("hostNCBIName hostName hostSuperkingdom hostPhylum hostClass hostOrder hostFamily hostGenus hostSpecies\n")
|
216
217
|
for currentFileName in hostFaList :
|
217
218
|
if currentFileName.startswith('.') :
|
@@ -235,7 +236,7 @@ else :
|
|
235
236
|
hostTaxaTable = numpy.genfromtxt(options.hostTaxaFile,delimiter="\t", dtype=str)
|
236
237
|
hostTaxaTable[hostTaxaTable=='']='unknown'
|
237
238
|
numpy.savetxt(hostTaxaFile, hostTaxaTable, fmt="%s", delimiter='\t', newline='\n')
|
238
|
-
|
239
|
+
|
239
240
|
filelog.flush()
|
240
241
|
|
241
242
|
#################### 1: count kmer and prepare list files ############################
|
@@ -259,7 +260,7 @@ for currentFileName in virusFaList :
|
|
259
260
|
filelog.write("Step 1: counting kmers for virus " + currentFileNameS + "\n")
|
260
261
|
for w in range(1, (kmax+1)) :
|
261
262
|
currentFilePath = os.path.join(options.virusFaDir, currentFileName)
|
262
|
-
|
263
|
+
|
263
264
|
currentKmerCountPath = os.path.join(kmerCountPath, currentFileNameS)
|
264
265
|
cmdKmer = countKmerOut + " -l -k " + str(w) + \
|
265
266
|
" -i " + currentFilePath +\
|
@@ -278,9 +279,9 @@ for currentFileName in virusFaList :
|
|
278
279
|
sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
|
279
280
|
filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
|
280
281
|
sys.exit(0)
|
281
|
-
|
282
|
+
|
282
283
|
filelog.flush()
|
283
|
-
|
284
|
+
|
284
285
|
end_time = time.time()
|
285
286
|
count += 1
|
286
287
|
#sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
|
@@ -329,9 +330,9 @@ for currentFileName in hostFaList :
|
|
329
330
|
sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
|
330
331
|
filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
|
331
332
|
sys.exit(0)
|
332
|
-
|
333
|
+
|
333
334
|
filelog.flush()
|
334
|
-
|
335
|
+
|
335
336
|
end_time = time.time()
|
336
337
|
count += 1
|
337
338
|
#sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
|
@@ -366,7 +367,7 @@ for c in iter(cmdCptMeasureOut.stderr.readline, b''):
|
|
366
367
|
sys.stdout.write(c.decode("utf-8"))
|
367
368
|
filelog.write(c.decode("utf-8"))
|
368
369
|
filelog.flush()
|
369
|
-
|
370
|
+
|
370
371
|
end_time = time.time()
|
371
372
|
count += 1
|
372
373
|
sys.stdout.write(" (Average time for computing dissimilarities for one virus-host pair: " + str(round((end_time - start_time)/count/len(virusFaList), 4)) + "s) \n")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: big_simon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-07-
|
11
|
+
date: 2018-07-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,24 +52,66 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: parse_fasta
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.5'
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: 2.5.2
|
65
|
+
type: :runtime
|
66
|
+
prerelease: false
|
67
|
+
version_requirements: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - "~>"
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '2.5'
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: 2.5.2
|
55
75
|
- !ruby/object:Gem::Dependency
|
56
76
|
name: rya
|
57
77
|
requirement: !ruby/object:Gem::Requirement
|
58
78
|
requirements:
|
59
79
|
- - "~>"
|
60
80
|
- !ruby/object:Gem::Version
|
61
|
-
version: 0.
|
81
|
+
version: 0.4.0
|
62
82
|
type: :runtime
|
63
83
|
prerelease: false
|
64
84
|
version_requirements: !ruby/object:Gem::Requirement
|
65
85
|
requirements:
|
66
86
|
- - "~>"
|
67
87
|
- !ruby/object:Gem::Version
|
68
|
-
version: 0.
|
88
|
+
version: 0.4.0
|
89
|
+
- !ruby/object:Gem::Dependency
|
90
|
+
name: trollop
|
91
|
+
requirement: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - "~>"
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '2.1'
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: 2.1.3
|
99
|
+
type: :runtime
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - "~>"
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '2.1'
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: 2.1.3
|
69
109
|
description: Viral host discovery pipeline.
|
70
110
|
email:
|
71
111
|
- moorer@udel.edu
|
72
|
-
executables:
|
112
|
+
executables:
|
113
|
+
- big_simon
|
114
|
+
- ranks
|
73
115
|
extensions: []
|
74
116
|
extra_rdoc_files: []
|
75
117
|
files:
|
@@ -80,17 +122,25 @@ files:
|
|
80
122
|
- COPYING
|
81
123
|
- Gemfile
|
82
124
|
- Gemfile.lock
|
125
|
+
- Makefile
|
83
126
|
- README.md
|
84
127
|
- Rakefile
|
85
128
|
- big_simon.gemspec
|
86
129
|
- bin/console
|
87
130
|
- bin/setup
|
131
|
+
- exe/big_simon
|
132
|
+
- exe/ranks
|
88
133
|
- lib/big_simon.rb
|
134
|
+
- lib/big_simon/parsers.rb
|
135
|
+
- lib/big_simon/pipeline.rb
|
136
|
+
- lib/big_simon/runners.rb
|
137
|
+
- lib/big_simon/utils.rb
|
89
138
|
- lib/big_simon/version.rb
|
90
139
|
- vendor/bin/mac/WIsH
|
91
140
|
- vendor/bin/mac/computeMeasure.out
|
92
141
|
- vendor/bin/mac/computeMeasure_onlyd2star.out
|
93
142
|
- vendor/bin/mac/countKmer.out
|
143
|
+
- vendor/bin/mac/mummer
|
94
144
|
- vendor/bin/mac/vhm.py
|
95
145
|
homepage: https://github.com/mooreryan/big_simon
|
96
146
|
licenses: []
|