big_simon 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +13 -1
- data/Gemfile.lock +7 -3
- data/Makefile +8 -0
- data/README.md +8 -2
- data/big_simon.gemspec +3 -1
- data/exe/big_simon +244 -0
- data/exe/ranks +93 -0
- data/lib/big_simon/parsers.rb +78 -0
- data/lib/big_simon/pipeline.rb +64 -0
- data/lib/big_simon/runners.rb +189 -0
- data/lib/big_simon/utils.rb +108 -0
- data/lib/big_simon/version.rb +13 -1
- data/lib/big_simon.rb +14 -87
- data/vendor/bin/mac/mummer +0 -0
- data/vendor/bin/mac/vhm.py +10 -9
- metadata +55 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dfbea3a58014cbec45a3959ad076fd842c6b393c
|
4
|
+
data.tar.gz: 69d9f8ac1dea196f64dd13fb06a13a4c26b38e45
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3a54fe903bb5c0f2f574a389dfc945ffe3d93bd2b1a15361bda81430542c2ef3990ac31e67b671690e056e23544e4f30f21dbecd756050dfb98dfb924fd15ca3
|
7
|
+
data.tar.gz: b61f6ba10b7efc267419ccccaef82654662923629e0b2b61e5a3c5e74b79fe8b753dd160c3e3d7fd57b1e926b54684f9367a839075544c496ff3a839ece57fb5
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,14 +1,17 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
big_simon (0.0
|
5
|
-
|
4
|
+
big_simon (0.1.0)
|
5
|
+
parse_fasta (~> 2.5, >= 2.5.2)
|
6
|
+
rya (~> 0.4.0)
|
7
|
+
trollop (~> 2.1, >= 2.1.3)
|
6
8
|
|
7
9
|
GEM
|
8
10
|
remote: https://rubygems.org/
|
9
11
|
specs:
|
10
12
|
abort_if (0.2.0)
|
11
13
|
diff-lcs (1.3)
|
14
|
+
parse_fasta (2.5.2)
|
12
15
|
rake (10.5.0)
|
13
16
|
rspec (3.7.0)
|
14
17
|
rspec-core (~> 3.7.0)
|
@@ -23,10 +26,11 @@ GEM
|
|
23
26
|
diff-lcs (>= 1.2.0, < 2.0)
|
24
27
|
rspec-support (~> 3.7.0)
|
25
28
|
rspec-support (3.7.1)
|
26
|
-
rya (0.
|
29
|
+
rya (0.4.0)
|
27
30
|
abort_if (~> 0.2.0)
|
28
31
|
systemu (~> 2.6, >= 2.6.5)
|
29
32
|
systemu (2.6.5)
|
33
|
+
trollop (2.1.3)
|
30
34
|
|
31
35
|
PLATFORMS
|
32
36
|
ruby
|
data/Makefile
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
test_small:
|
2
|
+
rm -r 0000TEST/; exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
3
|
+
|
4
|
+
test_small_install:
|
5
|
+
rm -r 0000TEST/; rake install && exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
6
|
+
|
7
|
+
test_toy:
|
8
|
+
rm -r toyexample_out; time exe/big_simon -v vendor/repos/VirHostMatcher/test/toyexample/virus/* -h vendor/repos/VirHostMatcher/test/toyexample/host/* -o toyexample_out -t 3
|
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# BigSimon
|
2
2
|
|
3
|
-
|
3
|
+
Hi, I'm BigSimon (but you can call me BigSi if you want), and I'm a pipeline for finding hosts of viruses!
|
4
|
+
|
5
|
+
Mainly, I'm just a wrapper for some other nice tools.
|
4
6
|
|
5
7
|
## Installation
|
6
8
|
|
@@ -20,7 +22,11 @@ Or install it yourself as:
|
|
20
22
|
|
21
23
|
## Usage
|
22
24
|
|
23
|
-
|
25
|
+
For now, there is not much documentation. To see the help message, run:
|
26
|
+
|
27
|
+
```
|
28
|
+
$ big_simon --help
|
29
|
+
```
|
24
30
|
|
25
31
|
## Development
|
26
32
|
|
data/big_simon.gemspec
CHANGED
@@ -26,5 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_development_dependency "rake", "~> 10.0"
|
27
27
|
spec.add_development_dependency "rspec", "~> 3.0"
|
28
28
|
|
29
|
-
spec.add_runtime_dependency "
|
29
|
+
spec.add_runtime_dependency "parse_fasta", "~> 2.5", ">= 2.5.2"
|
30
|
+
spec.add_runtime_dependency "rya", "~> 0.4.0"
|
31
|
+
spec.add_runtime_dependency "trollop", "~> 2.1", ">= 2.1.3"
|
30
32
|
end
|
data/exe/big_simon
ADDED
@@ -0,0 +1,244 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
Signal.trap("PIPE", "EXIT")
|
3
|
+
|
4
|
+
require "pp"
|
5
|
+
require "tempfile"
|
6
|
+
|
7
|
+
require "parse_fasta"
|
8
|
+
require "trollop"
|
9
|
+
|
10
|
+
require "big_simon"
|
11
|
+
|
12
|
+
# TODO make scaled scores with high score being better.
|
13
|
+
|
14
|
+
Process.extend Rya::CoreExtensions::Process
|
15
|
+
|
16
|
+
opts = Trollop.options do
|
17
|
+
version BigSimon::VERSION_BANNER
|
18
|
+
|
19
|
+
banner <<-EOS
|
20
|
+
|
21
|
+
#{BigSimon::VERSION_BANNER}
|
22
|
+
|
23
|
+
Hi, I'm BigSimon! I'm here to help you figure out the hosts for
|
24
|
+
your viruses.
|
25
|
+
|
26
|
+
I run a bunch of different programs. In addition to doing some
|
27
|
+
merging of results, I'll give you heatmaps for all the programs and
|
28
|
+
you can check for yourself.
|
29
|
+
|
30
|
+
The scaled scores run from 0 to 1 with lower scores being better.
|
31
|
+
|
32
|
+
Options:
|
33
|
+
EOS
|
34
|
+
|
35
|
+
opt :viruses, "Path to fasta file(s) with viruses", type: :strings
|
36
|
+
opt :hosts, "Path to fasta file(s) with hosts", type: :strings
|
37
|
+
opt :outdir, "Output directory", default: "big_simon"
|
38
|
+
opt :threads, "Number of threads to use", default: 1
|
39
|
+
end
|
40
|
+
|
41
|
+
Rya::AbortIf.logger.debug { "Command line opts: #{opts.inspect}" }
|
42
|
+
|
43
|
+
BigSimon::Utils.check_opt! opts, :viruses
|
44
|
+
BigSimon::Utils.check_opt! opts, :hosts
|
45
|
+
|
46
|
+
# Check infiles
|
47
|
+
[opts[:viruses], opts[:hosts]].flatten.each do |fname|
|
48
|
+
BigSimon::Utils.check_file! fname
|
49
|
+
end
|
50
|
+
|
51
|
+
Rya::AbortIf.abort_unless opts[:threads] > 0,
|
52
|
+
"--threads must be > 0"
|
53
|
+
|
54
|
+
programs = [
|
55
|
+
"WIsH",
|
56
|
+
"VirHostMatcher",
|
57
|
+
"mummer"
|
58
|
+
]
|
59
|
+
|
60
|
+
outdir = opts[:outdir]
|
61
|
+
threads = opts[:threads]
|
62
|
+
virus_fnames = opts[:viruses]
|
63
|
+
host_fnames = opts[:hosts]
|
64
|
+
|
65
|
+
FileUtils.mkdir_p outdir
|
66
|
+
|
67
|
+
tmpdir = File.join opts[:outdir], "big_simon_tmp"
|
68
|
+
tmpdir_virus = File.join tmpdir, "virus"
|
69
|
+
tmpdir_host = File.join tmpdir, "host"
|
70
|
+
|
71
|
+
# all_predictions_fname = File.join outdir, "scores_all.txt"
|
72
|
+
mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt"
|
73
|
+
|
74
|
+
virus_recs, host_recs = [], []
|
75
|
+
|
76
|
+
# Tempfile.open do |vir_f|
|
77
|
+
# Tempfile.open do |host_f|
|
78
|
+
# virus_fnames.each do |fname|
|
79
|
+
# ParseFasta::SeqFile.open(fname).each_record do |rec|
|
80
|
+
# vir_f.puts rec
|
81
|
+
#
|
82
|
+
# vir_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
|
83
|
+
# end
|
84
|
+
# end
|
85
|
+
#
|
86
|
+
# host_fnames.each do |fname|
|
87
|
+
# ParseFasta::SeqFile.open(fname).each_record do |rec|
|
88
|
+
# host_f.puts rec
|
89
|
+
# host_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
|
90
|
+
# end
|
91
|
+
# end
|
92
|
+
#
|
93
|
+
# vir_f.fsync
|
94
|
+
# host_f.fsync
|
95
|
+
#
|
96
|
+
# cmd = "mummer -maxmatch -l 15 #{host_f.path} #{vir_f.path} > /Users/moorer/Desktop/mummer.OUT"
|
97
|
+
# Process.run_and_time_it! "MUMMER", cmd
|
98
|
+
# end
|
99
|
+
# end
|
100
|
+
#
|
101
|
+
# header = nil
|
102
|
+
# hits = []
|
103
|
+
# hit_info = {}
|
104
|
+
# virus = nil
|
105
|
+
#
|
106
|
+
# File.open("/Users/moorer/Desktop/mummer.OUT", "rt").each_line.with_index do |line, idx|
|
107
|
+
# if line.start_with? '>'
|
108
|
+
# virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
|
109
|
+
#
|
110
|
+
# unless hit_info.has_key? virus
|
111
|
+
# hit_info[virus] = {}
|
112
|
+
# end
|
113
|
+
# else
|
114
|
+
# host, _, _, len = line.chomp.strip.split(" ")
|
115
|
+
# host = host.sub(/___reverse$/, "").strip
|
116
|
+
#
|
117
|
+
# unless hit_info[virus].has_key? host
|
118
|
+
# hit_info[virus][host] = -1
|
119
|
+
# end
|
120
|
+
#
|
121
|
+
# hit_info[virus][host] = len.to_i if len.to_i > hit_info[virus][host]
|
122
|
+
# end
|
123
|
+
# end
|
124
|
+
#
|
125
|
+
# puts
|
126
|
+
#
|
127
|
+
# hh = hit_info.map do |virus, info|
|
128
|
+
# [virus, info.to_a.sort_by {|gen, len| len}.reverse]
|
129
|
+
# end
|
130
|
+
#
|
131
|
+
# pp hh
|
132
|
+
|
133
|
+
# hh = hit_info.map do |virus, info|
|
134
|
+
# [virus, info.to_a.sort_by { |host, hit_len| hit_len }.reverse
|
135
|
+
#
|
136
|
+
# end
|
137
|
+
# p hit_info
|
138
|
+
|
139
|
+
scores_files = {}
|
140
|
+
programs.each do |program|
|
141
|
+
raw_fname = File.join outdir, "scores_raw.#{program}.txt"
|
142
|
+
scaled_fname = File.join outdir, "scores_scaled.#{program}.txt"
|
143
|
+
|
144
|
+
scores_files[program] = {
|
145
|
+
raw: File.open(raw_fname, "w"),
|
146
|
+
scaled: File.open(scaled_fname, "w")
|
147
|
+
}
|
148
|
+
end
|
149
|
+
|
150
|
+
scores_files.each do |program, files|
|
151
|
+
files.each do |name, file|
|
152
|
+
file.puts %w[virus host score].join "\t"
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
name_map_virus, all_ids_virus = BigSimon::Utils.set_up_tmp_dirs virus_fnames, tmpdir_virus, "virus"
|
157
|
+
name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmpdir_host, "host"
|
158
|
+
|
159
|
+
wish_outf = BigSimon::Runners.wish BigSimon::WISH, tmpdir_virus, tmpdir_host, tmpdir, threads
|
160
|
+
vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpdir_host, tmpdir
|
161
|
+
|
162
|
+
host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads
|
163
|
+
|
164
|
+
# Map them back to simple names. TODO just have it spit these out from the beginning.
|
165
|
+
host_info_mummer_simple_names = {}
|
166
|
+
inverted_name_map_virus = name_map_virus.invert
|
167
|
+
inverted_name_map_host = name_map_host.invert
|
168
|
+
|
169
|
+
host_info_mummer.each do |virus, host_tables|
|
170
|
+
virname = virus
|
171
|
+
if inverted_name_map_virus.has_key? virus
|
172
|
+
virname = inverted_name_map_virus[virus]
|
173
|
+
end
|
174
|
+
|
175
|
+
host_info_mummer_simple_names[virname] = []
|
176
|
+
|
177
|
+
host_tables.map do |table|
|
178
|
+
hostname = inverted_name_map_host.has_key?(table[:host]) ? inverted_name_map_host[table[:host]] : table[:host]
|
179
|
+
new_table = { host: hostname, score: table[:score], scaled_score: table[:scaled_score] }
|
180
|
+
|
181
|
+
host_info_mummer_simple_names[virname] << new_table
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
host_info_wish = BigSimon::Parsers.wish wish_outf
|
186
|
+
host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf
|
187
|
+
|
188
|
+
host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer_simple_names], programs
|
189
|
+
host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host
|
190
|
+
|
191
|
+
# puts
|
192
|
+
# pp host_info
|
193
|
+
# puts
|
194
|
+
|
195
|
+
# Just a basic all info file
|
196
|
+
# File.open all_predictions_fname, "w" do |f|
|
197
|
+
# f.puts %w[virus host program score scaled.score].join "\t"
|
198
|
+
|
199
|
+
host_info.each do |virus, h1|
|
200
|
+
h1.each do |host, h2|
|
201
|
+
lines = {}
|
202
|
+
|
203
|
+
h2[:scores].each do |program, score|
|
204
|
+
lines[[virus, host, program]] = [score]
|
205
|
+
|
206
|
+
scores_files[program][:raw].puts [virus, host, score].join "\t"
|
207
|
+
end
|
208
|
+
|
209
|
+
# Add in the scaled score too.
|
210
|
+
h2[:scaled_scores].each do |program, score|
|
211
|
+
lines[[virus, host, program]] << score
|
212
|
+
|
213
|
+
scores_files[program][:scaled].puts [virus, host, score].join "\t"
|
214
|
+
end
|
215
|
+
|
216
|
+
# lines.each do |(virus, host, program), (score, scaled_score)|
|
217
|
+
# f.puts [virus, host, program, score, scaled_score].join "\t"
|
218
|
+
# end
|
219
|
+
end
|
220
|
+
end
|
221
|
+
# end
|
222
|
+
|
223
|
+
# A file with mean scaled scores.
|
224
|
+
File.open mean_scaled_scores_fname, "w" do |f|
|
225
|
+
f.puts %w[virus host score].join "\t"
|
226
|
+
|
227
|
+
host_info.each do |virus, h1|
|
228
|
+
h1.each do |host, h2|
|
229
|
+
scaled_scores = h2[:scaled_scores].values
|
230
|
+
|
231
|
+
mean_scaled_score = scaled_scores.reduce(:+) / scaled_scores.length.to_f
|
232
|
+
|
233
|
+
f.puts [virus, host, mean_scaled_score].join "\t"
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
237
|
+
|
238
|
+
scores_files.each do |program, file|
|
239
|
+
file.values.map(&:close)
|
240
|
+
end
|
241
|
+
FileUtils.rm_r tmpdir
|
242
|
+
|
243
|
+
# Make the heatmaps
|
244
|
+
BigSimon::Runners.heatmaps BigSimon::RSCRIPT, outdir, File.join(outdir, "heatmaps")
|
data/exe/ranks
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
Signal.trap("PIPE", "EXIT")
|
3
|
+
|
4
|
+
require "pp"
|
5
|
+
|
6
|
+
by_program = {}
|
7
|
+
lines = []
|
8
|
+
|
9
|
+
TOP = (ENV["TOP"] || 5).to_i
|
10
|
+
|
11
|
+
ARGV.each do |fname|
|
12
|
+
scores = {}
|
13
|
+
File.open(fname, "rt").each_line.with_index do |line, idx|
|
14
|
+
unless idx.zero?
|
15
|
+
virus, host, score = line.chomp.split "\t"
|
16
|
+
|
17
|
+
unless scores.has_key? virus
|
18
|
+
scores[virus] = []
|
19
|
+
end
|
20
|
+
|
21
|
+
scores[virus] << [host, score.to_f]
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
scores.sort_by { |virus, _| virus }.each do |virus, host_scores|
|
26
|
+
# Lowest score is the best
|
27
|
+
top_5 = host_scores.sort_by { |host, score| score }.take(TOP).map(&:first)
|
28
|
+
|
29
|
+
line = [File.basename(fname), virus, top_5]
|
30
|
+
lines << line
|
31
|
+
|
32
|
+
# puts line.join "\t"
|
33
|
+
end
|
34
|
+
# puts
|
35
|
+
end
|
36
|
+
|
37
|
+
lines.each do |line|
|
38
|
+
program, virus, all = line
|
39
|
+
first = all.first
|
40
|
+
|
41
|
+
unless by_program.has_key? virus
|
42
|
+
by_program[virus] = {}
|
43
|
+
end
|
44
|
+
|
45
|
+
unless program == "scores_scaled.mean.txt"
|
46
|
+
by_program[virus][program] = { first: first, all: all }
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# These track the number of times a host shows up in the first spot and in the top N spots for that virus for all programs.
|
51
|
+
first_table = {}
|
52
|
+
top_host_table = {}
|
53
|
+
|
54
|
+
by_program.each do |virus, program_tables|
|
55
|
+
first_table[virus] = {}
|
56
|
+
top_host_table[virus] = {}
|
57
|
+
|
58
|
+
program_tables.each do |program, top_info|
|
59
|
+
|
60
|
+
|
61
|
+
first_host = top_info[:first]
|
62
|
+
all_top = top_info[:all]
|
63
|
+
|
64
|
+
unless first_table[virus].has_key? first_host
|
65
|
+
first_table[virus][first_host] = []
|
66
|
+
end
|
67
|
+
|
68
|
+
|
69
|
+
first_table[virus][first_host] << program
|
70
|
+
|
71
|
+
all_top.each do |top_host|
|
72
|
+
unless top_host_table[virus].has_key? top_host
|
73
|
+
top_host_table[virus][top_host] = []
|
74
|
+
end
|
75
|
+
|
76
|
+
top_host_table[virus][top_host] << program
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
first_table.each do |virus, host_counts|
|
82
|
+
host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
|
83
|
+
STDERR.puts [virus, :best, host, programs.count, programs].join "\t"
|
84
|
+
end
|
85
|
+
STDERR.puts
|
86
|
+
end
|
87
|
+
|
88
|
+
top_host_table.each do |virus, host_counts|
|
89
|
+
host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
|
90
|
+
puts [virus, :top_N, host, programs.count, programs].join "\t"
|
91
|
+
end
|
92
|
+
puts
|
93
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module BigSimon
|
2
|
+
# Methods for parsing output files
|
3
|
+
class Parsers
|
4
|
+
|
5
|
+
# Parses the comma-separated distance matrix written by VirHostMatcher.
#
# The first line is a header: its first cell is a label and the remaining
# cells are host names.  Every following line is a virus name followed by one
# distance per host.  A single trailing comma on a line is ignored.
#
# @note VirHostMatcher returns true distances that run from 0 to 1, so it doesn't need scaling.
# @note VirHostMatcher includes the whole file name as the id of the organism, so we chop off some common endings.
#
# @param fname [String] path to the VirHostMatcher output file
# @return [Hash] virus name => Array of { host:, score:, scaled_score: }
#   tables, sorted so that the lowest (best) distance comes first
def self.vir_host_matcher fname
  hosts = nil
  host_info = {}

  # The block form closes the file handle as soon as parsing is done.
  File.open(fname, "rt") do |f|
    f.each_line.with_index do |line, line_idx|
      # Get rid of the line ending and of a trailing comma.
      fields = line.chomp.sub(/,$/, "").split ","

      if line_idx.zero?
        # The first header cell only labels the table, so it is skipped.
        hosts = fields.drop(1).map { |str| BigSimon::Utils.strip_suffix str }
      elsif !fields.empty? # a blank line carries no virus
        virus = BigSimon::Utils.strip_suffix fields.shift

        # In this case the best value is the lowest distance.
        host_info[virus] = fields.each_with_index.map do |dist, host_idx|
          distance = dist.to_f

          { host: hosts[host_idx], score: distance, scaled_score: distance }
        end.sort_by { |ht| ht[:scaled_score] }
      end
    end
  end

  host_info
end
|
36
|
+
|
37
|
+
# Parses the tab-separated log likelihood matrix written by WIsH.
#
# The file has one column per virus and one row per host.  Its header line
# lacks the top-left corner cell, so an empty cell is added before the table
# is transposed into one row per virus.
#
# @note WIsH gives log likelihoods so the scaled value is actually scaled.
# @note The viruses and hosts will have the ID rather than the file name.
#
# @param fname [String] path to the WIsH likelihood matrix
# @return [Hash] virus => Array of { host:, score:, scaled_score: } tables,
#   sorted by ascending scaled score (1 - e**score)
def self.wish fname
  # The block form closes the file handle as soon as the rows are read.
  rows = File.open(fname, "rt") do |f|
    f.each_line.map { |line| line.chomp.split("\t") }
  end

  # Pad the header so that it lines up with the data rows.
  rows.first.unshift("") unless rows.empty?

  host_row, *virus_rows = rows.transpose

  # An empty file has nothing to report.
  return {} if host_row.nil?

  hosts = host_row.drop(1)

  virus_rows.each_with_object({}) do |(virus, *raw_scores), host_info|
    host_vals = raw_scores.each_with_index.map do |raw_score, host_idx|
      score = raw_score.to_f

      { host: hosts[host_idx], score: score, scaled_score: 1 - Math.exp(score) }
    end

    # Sort each virus exactly once, as soon as its table is complete.
    host_info[virus] = host_vals.sort_by { |ht| ht[:scaled_score] }
  end
end
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module BigSimon
|
2
|
+
class Pipeline
|
3
|
+
# Renames the viruses and hosts of a collated results table.
#
# Any virus or host name without an entry in its name map is kept as it is.
#
# @param collated_results_table { virus => host => score_type => program => score }
# @param virus_name_map [Hash] current virus name => new virus name
# @param host_name_map [Hash] current host name => new host name
# @return [Hash] a new table with the same score tables under the new names
def self.map_taxa collated_results_table, virus_name_map, host_name_map
  collated_results_table.each_with_object({}) do |(virus_name, host_table), new_results_table|
    new_virus_name = virus_name_map.fetch virus_name, virus_name

    new_results_table[new_virus_name] =
      host_table.each_with_object({}) do |(host_name, score_table), new_host_table|
        new_host_name = host_name_map.fetch host_name, host_name

        new_host_table[new_host_name] = score_table
      end
  end
end
|
29
|
+
|
30
|
+
# Merges the per-program host info tables into a single table.
#
# @param [Array<Hash>] results_table host info hash tables. See functions in Parsers class.
# @param [Array<String>] programs names of programs generating hash tables (in same order as results_table)
# @return [Hash] virus => host => { scores: { program => score },
#   scaled_scores: { program => scaled score } }
def self.collate_host_results results_table, programs
  Rya::AbortIf.assert results_table.count == programs.count

  virus_host_scores = {}

  results_table.zip(programs).each do |host_info, program|
    host_info.each do |virus, host_tables|
      # Register the virus even when this program lists no hosts for it.
      hosts = (virus_host_scores[virus] ||= {})

      host_tables.each do |host_table|
        entry = (hosts[host_table[:host]] ||= { scores: {}, scaled_scores: {} })

        entry[:scores][program] = host_table[:score]
        entry[:scaled_scores][program] = host_table[:scaled_score]
      end
    end
  end

  virus_host_scores
end
|
63
|
+
end
|
64
|
+
end
|
@@ -0,0 +1,189 @@
|
|
1
|
+
require "shellwords"
require "tempfile"
|
2
|
+
|
3
|
+
module BigSimon
|
4
|
+
class Runners
|
5
|
+
|
6
|
+
# Runs mummer on every virus file / host file pair and scores each pair by the
# length of its longest match.
#
# This one's a bit different as it parses as well and returns original names.
#
# The raw score is the largest value found in the fourth column of the mummer
# output (0 when there are no match lines).  Once all pairs have been run, the
# scaled score is computed with Rya's +scale+ helper, called as
# scale(score, 0, max, 1, 0) where max is the largest raw score over all
# viruses (i.e. scaling is global, not per virus).
#
# @todo Also do the reverse of each genome in case it's a contig.
# @todo Pull the minimum match length (-l 15) into a constant.
#
# @param exe [String] the mummer executable
# @param vir_dir [String] directory with the virus fasta files
# @param host_dir [String] directory with the host fasta files
# @param outdir [String] directory for the temporary mummer output files
# @param threads [Integer] currently unused
# @return [Hash] virus => Array of { host:, score:, scaled_score: } tables
def self.mummer exe, vir_dir, host_dir, outdir, threads
  klass = Class.new.extend Rya::CoreExtensions::Math
  FileUtils.mkdir_p outdir

  results = {}

  Dir.glob(vir_dir + "/*").each do |vir_fname|
    virus = nil

    Dir.glob(host_dir + "/*").each do |host_fname|
      vir_base  = File.basename vir_fname
      host_base = File.basename host_fname
      outfname  = File.join outdir, "#{vir_base}___#{host_base}.mummer"

      # -l is min length of a match
      # -F to force 4 columns
      # Paths are shell-escaped so that spaces and the like survive the shell.
      cmd = "#{exe} -F " \
            "-maxmatch " \
            "-l 15 " \
            "#{Shellwords.escape(host_fname)} " \
            "#{Shellwords.escape(vir_fname)} " \
            "> #{Shellwords.escape(outfname)}"

      Process.run_and_time_it! "Calculating matches", cmd

      # Note there should only be one '>' per file here.
      # NOTE(review): host stays nil when the output has no match lines, so the
      # pair is then recorded under a nil host -- confirm this is intended.
      host = nil
      score = 0

      # The block form closes the file before it gets removed below.
      File.open(outfname, "rt") do |f|
        f.each_line.with_index do |line, idx|
          if idx.zero?
            this_virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip

            # Every output file for this virus file must name the same virus.
            Rya::AbortIf::abort_unless(this_virus == virus, "OOPS") if virus

            virus ||= this_virus
          else
            ary = line.chomp.strip.split(" ")
            Rya::AbortIf.abort_unless ary.count == 4, "Problem parsing #{outfname} (mummer output)"

            host = ary[0].sub(/___reverse$/, "").strip
            len = ary[3].to_i

            # Only the longest match counts.
            score = len if len > score
          end
        end
      end

      (results[virus] ||= []) << { host: host, score: score, scaled_score: nil }

      FileUtils.rm outfname
    end
  end

  # NOTE(review): max is 0 when no pair has any match; how scale behaves for a
  # zero-width input range is not visible from here -- confirm.
  max = results.values.flat_map { |host_tables| host_tables.map { |table| table[:score] } }.max

  results.each_value do |host_tables|
    host_tables.each do |host_table|
      host_table[:scaled_score] = klass.scale host_table[:score], 0, max, 1, 0
    end
  end

  results
end
|
98
|
+
|
99
|
+
# Runs VirHostMatcher (d2star dissimilarity only) and tidies up its output.
#
# After the run, the "tmp" directory and the extra files VirHostMatcher leaves
# in outdir are removed, and "d2star_k6.csv" is renamed to
# "vir_host_matcher.txt".
#
# @param exe [String] path to the VirHostMatcher python script
# @param vir_dir [String] directory with the virus fasta files
# @param host_dir [String] directory with the host fasta files
# @param outdir [String] output directory (created if needed)
# @return [String] path to the renamed distance matrix
def self.vir_host_matcher exe, vir_dir, host_dir, outdir
  FileUtils.mkdir_p outdir

  # Directory arguments are shell-escaped so that paths with spaces or shell
  # metacharacters reach the script intact.
  cmd = "python #{exe} " \
        "-v #{Shellwords.escape(vir_dir)} " \
        "-b #{Shellwords.escape(host_dir)} " \
        "-o #{Shellwords.escape(outdir)} " \
        "-d 1" # only compute d2star dissimilarity

  Process.run_and_time_it! "Computing d2star dissimilarity", cmd

  tmp_dir = File.join outdir, "tmp"
  FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir

  # By-products of the run that are not needed.
  %w[d2star_k6_main.html hostTaxa.txt_new.txt].each do |fname|
    path = File.join outdir, fname

    FileUtils.rm path if File.exist? path
  end

  new_outf = File.join outdir, "vir_host_matcher.txt"
  FileUtils.mv File.join(outdir, "d2star_k6.csv"), new_outf

  new_outf
end
|
126
|
+
|
127
|
+
# Runs the WIsH program
#
# Builds the host models in "<outdir>/model", predicts against them, removes
# the model directory, and renames "llikelihood.matrix" to "wish.txt".
#
# @param exe [String] the WIsH executable
# @param vir_dir [String] directory with the virus fasta files
# @param host_dir [String] directory with the host fasta files
# @param outdir [String] directory for the model and the result file
# @param threads [Integer] value passed to WIsH's -t option
# @return [String] path to the renamed likelihood matrix
#
# @raise [AbortIf::Exit] if commands fail
def self.wish exe, vir_dir, host_dir, outdir, threads
  model_dir = File.join outdir, "model"

  FileUtils.mkdir_p model_dir

  # Directory arguments are shell-escaped so that paths with spaces or shell
  # metacharacters reach WIsH intact.
  escaped_model_dir = Shellwords.escape model_dir

  build_model = "#{exe} " \
                "-t #{threads} " \
                "-c build " \
                "-g #{Shellwords.escape(host_dir)} " \
                "-m #{escaped_model_dir}"

  predict = "#{exe} " \
            "-t #{threads} " \
            "-c predict " \
            "-g #{Shellwords.escape(vir_dir)} " \
            "-m #{escaped_model_dir} " \
            "-r #{Shellwords.escape(outdir)}"

  Process.run_and_time_it! "Building model", build_model
  Process.run_and_time_it! "Predicting host", predict

  FileUtils.rm_r model_dir if Dir.exist? model_dir

  new_outf = File.join outdir, "wish.txt"
  FileUtils.mv File.join(outdir, "llikelihood.matrix"), new_outf

  new_outf
end
|
159
|
+
|
160
|
+
# Draws one heatmap PDF for every "scores*.txt" file in indir.
#
# The R code comes from BigSimon::Utils.rcode; it is written to a temporary
# file which is then handed to exe.
#
# @param exe [String] the program that runs the generated R script
# @param indir [String] directory holding the scores*.txt files
# @param outdir [String] directory for the PDFs (created if needed)
# @return [Array<String>] the heatmap file names passed to the R code
def self.heatmaps exe, indir, outdir
  FileUtils.mkdir_p outdir

  # Pairs of [scores file, heatmap file].
  fnames = Dir.glob("#{indir}/scores*.txt").map do |in_fname|
    basename = File.basename in_fname, File.extname(in_fname)

    [in_fname, File.join(outdir, "#{basename}.heatmap.pdf")]
  end

  rcode_str = BigSimon::Utils.rcode fnames

  Object::Tempfile.open do |f|
    f.puts rcode_str
    f.fsync # ensure no data is buffered

    # The temp dir may contain spaces, so escape the script path.
    cmd = "#{exe} #{Shellwords.escape(f.path)}"
    Process.run_and_time_it! "Drawing heatmaps", cmd
  end

  fnames.map(&:last)
end
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
|
@@ -0,0 +1,108 @@
|
|
1
|
+
module BigSimon
|
2
|
+
# @todo These don't have unit tests yet.
|
3
|
+
# @note Skips any duplicate IDs. Only keeps the first one.
|
4
|
+
class Utils
|
5
|
+
# Aborts with a help hint when a file name is given but no such file exists.
#
# @param fname [String, nil] path to check; a nil/false value is never an error
def self.check_file! fname
  missing = fname && !File.exist?(fname)

  Rya::AbortIf.abort_if missing,
                        "#{fname} doesn't exist! Try big_simon --help for help."
end
|
9
|
+
|
10
|
+
# Aborts with a help hint unless the option was given on the command line.
#
# The option counts as given when opts holds a truthy value under the
# "<arg>_given" key.  A missing key means "not given"; it must not raise a
# KeyError, otherwise the friendly message below is never shown.
#
# @param opts [Hash] parsed command line options
# @param arg [Symbol] name of the required option
def self.check_opt! opts, arg
  given = opts.fetch :"#{arg}_given", false

  Rya::AbortIf.abort_unless given,
                            "You must specify --#{arg.to_s.tr('_', '-')}. Try big_simon --help for help."
end
|
14
|
+
|
15
|
+
def self.rcode fnames
|
16
|
+
functions = %Q|
|
17
|
+
library(reshape2)
|
18
|
+
library(gplots)
|
19
|
+
library(RColorBrewer)
|
20
|
+
|
21
|
+
file.join <- function(...) {
|
22
|
+
paste(..., sep="/")
|
23
|
+
}
|
24
|
+
|
25
|
+
draw.heatmap <- function(infname, outfname) {
|
26
|
+
dat <- read.table(infname, header=T, sep="\t")
|
27
|
+
|
28
|
+
wide.dat <- dcast(dat, host ~ virus, value.var="score")
|
29
|
+
|
30
|
+
hosts <- wide.dat[, 1]
|
31
|
+
scores <- wide.dat[, 2:ncol(wide.dat)]
|
32
|
+
scores.numeric <- apply(scores, 2, as.numeric)
|
33
|
+
|
34
|
+
scores.matrix <- as.matrix(scores.numeric)
|
35
|
+
|
36
|
+
rownames(scores.matrix) <- hosts
|
37
|
+
|
38
|
+
palette <- "YlOrBr"
|
39
|
+
col <- colorRampPalette(brewer.pal(n=9, palette))(n = 25)
|
40
|
+
size <- 0.75
|
41
|
+
|
42
|
+
pdf(outfname, height=5, width=8)
|
43
|
+
|
44
|
+
heatmap.2(scores.matrix,
|
45
|
+
trace="none", ## Disable those wonky lines.
|
46
|
+
col=col, ## Set the color.
|
47
|
+
|
48
|
+
## Size opts
|
49
|
+
margins=c(11, 11), cexRow=size, cexCol=size,
|
50
|
+
|
51
|
+
## Key labeling
|
52
|
+
key.xlab="Score")
|
53
|
+
|
54
|
+
invisible(dev.off())
|
55
|
+
}
|
56
|
+
|
57
|
+
|
|
58
|
+
|
59
|
+
drawing = fnames.map do |in_fname, out_fname|
|
60
|
+
%Q{
|
61
|
+
|
62
|
+
draw.heatmap("#{in_fname}", "#{out_fname}")
|
63
|
+
}
|
64
|
+
end.join
|
65
|
+
|
66
|
+
[functions, drawing].join "\n"
|
67
|
+
end
|
68
|
+
|
69
|
+
# Turns a log likelihood into a scaled score: 1 - e**ll.
#
# @param ll [Numeric] the log likelihood
# @return [Float] one minus the exponential of ll
def self.scale_log_likelihood ll
  likelihood = Math.exp ll

  1 - likelihood
end
|
72
|
+
|
73
|
+
# Splits the records of the given fasta files into one file per record.
#
# Every record whose ID is seen for the first time gets a simple ID of the
# form "<which>_<n>", with n counting up from 0, and is written to
# "<tmpdir>/<simple ID>.fa".  A record whose ID was already seen is skipped
# with a warning, so only the first one is kept.
#
# @param fastas [Array<String>] paths of the fasta files to split
# @param tmpdir [String] directory for the per-record files (created if needed)
# @param which [String] prefix of the simple IDs
# @return [Array] the simple ID => original ID map, and the Set of original IDs
def self.set_up_tmp_dirs fastas, tmpdir, which
  Object::FileUtils.mkdir_p tmpdir

  seen_ids = Set.new
  simple_to_original = {}

  fastas.each do |fasta|
    ParseFasta::SeqFile.open(fasta).each_record do |record|
      # Set#add? returns nil when the ID is already in the set.
      unless seen_ids.add? record.id
        Rya::AbortIf.logger.warn { "#{record.id} was seen more than one time! Duplicate organism IDs are not allowed, so we will only keep the first one." }

        next
      end

      # One map entry per kept record, so the size is the next sequence number.
      simple_id = "#{which}_#{simple_to_original.size}"
      simple_to_original[simple_id] = record.id

      File.open(File.join(tmpdir, "#{simple_id}.fa"), "w") { |out| out.puts record }
    end
  end

  [simple_to_original, seen_ids]
end
|
103
|
+
|
104
|
+
# Removes a trailing ".fasta" or ".fa" extension from a name.
#
# The dot is matched literally, so a name that merely ends in "fa" or "fasta"
# (e.g. "alfa") is returned unchanged.
#
# @param fname [String] a file or organism name
# @return [String] a copy of fname without the extension
def self.strip_suffix fname
  fname.sub(/\.(?:fasta|fa)$/, "")
end
|
107
|
+
end
|
108
|
+
end
|
data/lib/big_simon/version.rb
CHANGED
@@ -1,3 +1,15 @@
|
|
1
1
|
module BigSimon
|
2
|
-
VERSION = "0.0
|
2
|
+
VERSION = "0.1.0"
|
3
|
+
|
4
|
+
COPYRIGHT = "2018 Ryan Moore"
|
5
|
+
CONTACT = "moorer@udel.edu"
|
6
|
+
WEBSITE = "https://github.com/mooreryan/InteinFinder"
|
7
|
+
LICENSE = "GPLv3"
|
8
|
+
|
9
|
+
VERSION_BANNER =
|
10
|
+
" # Version: v#{VERSION}
|
11
|
+
# Copyright: #{COPYRIGHT}
|
12
|
+
# Contact: #{CONTACT}
|
13
|
+
# License: #{LICENSE}"
|
14
|
+
|
3
15
|
end
|
data/lib/big_simon.rb
CHANGED
@@ -1,100 +1,27 @@
|
|
1
1
|
require "rya"
|
2
|
+
require "set"
|
3
|
+
require "pp"
|
2
4
|
|
3
5
|
require "big_simon/version"
|
4
6
|
|
7
|
+
require "big_simon/utils"
|
8
|
+
|
9
|
+
require "big_simon/runners"
|
10
|
+
require "big_simon/parsers"
|
11
|
+
require "big_simon/pipeline"
|
12
|
+
|
5
13
|
Time.extend Rya::CoreExtensions::Time
|
6
14
|
Process.extend Rya::CoreExtensions::Process
|
15
|
+
Array.include Rya::CoreExtensions::Array
|
16
|
+
Math.extend Rya::CoreExtensions::Math
|
7
17
|
|
8
18
|
module BigSimon
|
9
|
-
|
10
|
-
# Project directories
|
11
19
|
ROOT = File.join __dir__, ".."
|
12
20
|
BIN = File.join ROOT, "vendor", "bin", "mac"
|
13
21
|
SPEC = File.join ROOT, "spec"
|
14
22
|
TEST_FILES = File.join SPEC, "test_files"
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
hosts = nil
|
20
|
-
|
21
|
-
host_info = {}
|
22
|
-
File.open(fname, "rt").each_line.with_index do |line, idx|
|
23
|
-
line.chomp!
|
24
|
-
line.sub! /,$/, "" # git rid of trailing commas
|
25
|
-
|
26
|
-
if idx.zero?
|
27
|
-
stat, *hosts = line.split ","
|
28
|
-
else
|
29
|
-
ary = line.split ","
|
30
|
-
virus = ary.shift
|
31
|
-
|
32
|
-
dists = ary.map.
|
33
|
-
with_index { |dist, idx| [hosts[idx], dist.to_f] }.
|
34
|
-
sort_by { |_, dist| dist }
|
35
|
-
|
36
|
-
best_host = dists[0][0]
|
37
|
-
|
38
|
-
host_info[virus] = {
|
39
|
-
best: best_host,
|
40
|
-
all: dists
|
41
|
-
}
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
host_info
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
class Runners
|
50
|
-
|
51
|
-
# Runs the WIsH program
|
52
|
-
#
|
53
|
-
# @raise [AbortIf::Exit] if commands fail
|
54
|
-
def self.wish exe, vir_dir, host_dir, outdir, threads
|
55
|
-
model_dir = File.join outdir, "model"
|
56
|
-
|
57
|
-
FileUtils.mkdir_p model_dir
|
58
|
-
|
59
|
-
build_model = "#{exe} " \
|
60
|
-
"-t #{threads} " \
|
61
|
-
"-c build " \
|
62
|
-
"-g #{host_dir} " \
|
63
|
-
"-m #{model_dir}"
|
64
|
-
|
65
|
-
predict = "#{exe} " \
|
66
|
-
"-t #{threads} " \
|
67
|
-
"-c predict " \
|
68
|
-
"-g #{vir_dir} " \
|
69
|
-
"-m #{model_dir} " \
|
70
|
-
"-r #{outdir} -b"
|
71
|
-
|
72
|
-
Process.run_and_time_it! "Building model", build_model
|
73
|
-
Process.run_and_time_it! "Predicting host", predict
|
74
|
-
|
75
|
-
FileUtils.rm_r model_dir if Dir.exist? model_dir
|
76
|
-
end
|
77
|
-
|
78
|
-
def self.vir_host_matcher exe, vir_dir, host_dir, outdir
|
79
|
-
FileUtils.mkdir_p outdir
|
80
|
-
|
81
|
-
cmd = "python #{exe} " \
|
82
|
-
"-v #{vir_dir} " \
|
83
|
-
"-b #{host_dir} " \
|
84
|
-
"-o #{outdir} " \
|
85
|
-
"-d 1" # only compute d2star dissimilarity
|
86
|
-
|
87
|
-
Process.run_and_time_it! "Computing d2star dissimilarity", cmd
|
88
|
-
|
89
|
-
tmp_dir = File.join outdir, "tmp"
|
90
|
-
FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir
|
91
|
-
|
92
|
-
bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
|
93
|
-
bad_files.each do |fname|
|
94
|
-
path = File.join outdir, fname
|
95
|
-
|
96
|
-
FileUtils.rm path if File.exist? path
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
23
|
+
WISH = File.join BIN, "WIsH"
|
24
|
+
VHM = File.join BIN, "vhm.py"
|
25
|
+
MUMMER = File.join BIN, "mummer"
|
26
|
+
RSCRIPT = "Rscript"
|
100
27
|
end
|
Binary file
|
data/vendor/bin/mac/vhm.py
CHANGED
@@ -46,8 +46,9 @@ if not os.path.exists(tmpDir) :
|
|
46
46
|
filelog = open(os.path.join(tmpDir, 'vhm.log'), 'w')
|
47
47
|
|
48
48
|
## name length ##
|
49
|
-
nameLen = 93 - len(options.outDir)
|
49
|
+
# nameLen = 93 - len(options.outDir)
|
50
50
|
#### possibly because of the kmercount folder name for each contig is too long?
|
51
|
+
nameLen = 99999999999999
|
51
52
|
|
52
53
|
|
53
54
|
#################### 0: preparation ############################
|
@@ -211,7 +212,7 @@ if options.hostTaxaFile is None :
|
|
211
212
|
hostTaxaFileWrite = open(hostTaxaFile, 'w') ## make file blank
|
212
213
|
hostTaxaFileWrite.close()
|
213
214
|
hostTaxaFileWrite = open(hostTaxaFile, 'a')
|
214
|
-
|
215
|
+
|
215
216
|
hostTaxaFileWrite.write("hostNCBIName hostName hostSuperkingdom hostPhylum hostClass hostOrder hostFamily hostGenus hostSpecies\n")
|
216
217
|
for currentFileName in hostFaList :
|
217
218
|
if currentFileName.startswith('.') :
|
@@ -235,7 +236,7 @@ else :
|
|
235
236
|
hostTaxaTable = numpy.genfromtxt(options.hostTaxaFile,delimiter="\t", dtype=str)
|
236
237
|
hostTaxaTable[hostTaxaTable=='']='unknown'
|
237
238
|
numpy.savetxt(hostTaxaFile, hostTaxaTable, fmt="%s", delimiter='\t', newline='\n')
|
238
|
-
|
239
|
+
|
239
240
|
filelog.flush()
|
240
241
|
|
241
242
|
#################### 1: count kmer and prepare list files ############################
|
@@ -259,7 +260,7 @@ for currentFileName in virusFaList :
|
|
259
260
|
filelog.write("Step 1: counting kmers for virus " + currentFileNameS + "\n")
|
260
261
|
for w in range(1, (kmax+1)) :
|
261
262
|
currentFilePath = os.path.join(options.virusFaDir, currentFileName)
|
262
|
-
|
263
|
+
|
263
264
|
currentKmerCountPath = os.path.join(kmerCountPath, currentFileNameS)
|
264
265
|
cmdKmer = countKmerOut + " -l -k " + str(w) + \
|
265
266
|
" -i " + currentFilePath +\
|
@@ -278,9 +279,9 @@ for currentFileName in virusFaList :
|
|
278
279
|
sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
|
279
280
|
filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
|
280
281
|
sys.exit(0)
|
281
|
-
|
282
|
+
|
282
283
|
filelog.flush()
|
283
|
-
|
284
|
+
|
284
285
|
end_time = time.time()
|
285
286
|
count += 1
|
286
287
|
#sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
|
@@ -329,9 +330,9 @@ for currentFileName in hostFaList :
|
|
329
330
|
sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
|
330
331
|
filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
|
331
332
|
sys.exit(0)
|
332
|
-
|
333
|
+
|
333
334
|
filelog.flush()
|
334
|
-
|
335
|
+
|
335
336
|
end_time = time.time()
|
336
337
|
count += 1
|
337
338
|
#sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
|
@@ -366,7 +367,7 @@ for c in iter(cmdCptMeasureOut.stderr.readline, b''):
|
|
366
367
|
sys.stdout.write(c.decode("utf-8"))
|
367
368
|
filelog.write(c.decode("utf-8"))
|
368
369
|
filelog.flush()
|
369
|
-
|
370
|
+
|
370
371
|
end_time = time.time()
|
371
372
|
count += 1
|
372
373
|
sys.stdout.write(" (Average time for computing dissimilarities for one virus-host pair: " + str(round((end_time - start_time)/count/len(virusFaList), 4)) + "s) \n")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: big_simon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-07-
|
11
|
+
date: 2018-07-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,24 +52,66 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: parse_fasta
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.5'
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: 2.5.2
|
65
|
+
type: :runtime
|
66
|
+
prerelease: false
|
67
|
+
version_requirements: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - "~>"
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '2.5'
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: 2.5.2
|
55
75
|
- !ruby/object:Gem::Dependency
|
56
76
|
name: rya
|
57
77
|
requirement: !ruby/object:Gem::Requirement
|
58
78
|
requirements:
|
59
79
|
- - "~>"
|
60
80
|
- !ruby/object:Gem::Version
|
61
|
-
version: 0.
|
81
|
+
version: 0.4.0
|
62
82
|
type: :runtime
|
63
83
|
prerelease: false
|
64
84
|
version_requirements: !ruby/object:Gem::Requirement
|
65
85
|
requirements:
|
66
86
|
- - "~>"
|
67
87
|
- !ruby/object:Gem::Version
|
68
|
-
version: 0.
|
88
|
+
version: 0.4.0
|
89
|
+
- !ruby/object:Gem::Dependency
|
90
|
+
name: trollop
|
91
|
+
requirement: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - "~>"
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '2.1'
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: 2.1.3
|
99
|
+
type: :runtime
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - "~>"
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '2.1'
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: 2.1.3
|
69
109
|
description: Viral host discovery pipeline.
|
70
110
|
email:
|
71
111
|
- moorer@udel.edu
|
72
|
-
executables:
|
112
|
+
executables:
|
113
|
+
- big_simon
|
114
|
+
- ranks
|
73
115
|
extensions: []
|
74
116
|
extra_rdoc_files: []
|
75
117
|
files:
|
@@ -80,17 +122,25 @@ files:
|
|
80
122
|
- COPYING
|
81
123
|
- Gemfile
|
82
124
|
- Gemfile.lock
|
125
|
+
- Makefile
|
83
126
|
- README.md
|
84
127
|
- Rakefile
|
85
128
|
- big_simon.gemspec
|
86
129
|
- bin/console
|
87
130
|
- bin/setup
|
131
|
+
- exe/big_simon
|
132
|
+
- exe/ranks
|
88
133
|
- lib/big_simon.rb
|
134
|
+
- lib/big_simon/parsers.rb
|
135
|
+
- lib/big_simon/pipeline.rb
|
136
|
+
- lib/big_simon/runners.rb
|
137
|
+
- lib/big_simon/utils.rb
|
89
138
|
- lib/big_simon/version.rb
|
90
139
|
- vendor/bin/mac/WIsH
|
91
140
|
- vendor/bin/mac/computeMeasure.out
|
92
141
|
- vendor/bin/mac/computeMeasure_onlyd2star.out
|
93
142
|
- vendor/bin/mac/countKmer.out
|
143
|
+
- vendor/bin/mac/mummer
|
94
144
|
- vendor/bin/mac/vhm.py
|
95
145
|
homepage: https://github.com/mooreryan/big_simon
|
96
146
|
licenses: []
|