big_simon 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/Makefile +2 -2
- data/exe/big_simon +6 -26
- data/exe/{ranks → consensus_predictions} +15 -8
- data/lib/big_simon/runners.rb +108 -1
- data/lib/big_simon/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9c81d1057b304f170cdf1dd6a7551bd981efc232
|
4
|
+
data.tar.gz: c0550efe308bc0080c9f14ce65057b100ff2f32b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 652abc1ab4507ed51012cde1a541a8e5ce75e30a2c2d8b9ef4cb0363e49bb8ea4f088dcb4786bb5f9f466f8f0ed0d2f2e6d9b446aa3a4c8fd2a6a8f4370f6a78
|
7
|
+
data.tar.gz: 3a82d66018a1dea01afb9f0230a9f8ad6ac90d96d5862a79b57ecb03f954eedf9183c2dc9d03b6f98a6a0675b3293577c4e5001fa7f792bbe68ebefe397d47f8
|
data/Gemfile.lock
CHANGED
data/Makefile
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
test_small:
|
2
|
-
rm -r 0000TEST/; exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
2
|
+
rm -r 0000TEST/; time exe/big_simon -t 3 -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
3
3
|
|
4
4
|
test_small_install:
|
5
|
-
rm -r 0000TEST/; rake install && exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
5
|
+
rm -r 0000TEST/; rake install && time exe/big_simon -t 3 -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
6
6
|
|
7
7
|
test_toy:
|
8
8
|
rm -r toyexample_out; time exe/big_simon -v vendor/repos/VirHostMatcher/test/toyexample/virus/* -h vendor/repos/VirHostMatcher/test/toyexample/host/* -o toyexample_out -t 3
|
data/exe/big_simon
CHANGED
@@ -71,7 +71,7 @@ tmpdir_host = File.join tmpdir, "host"
|
|
71
71
|
# all_predictions_fname = File.join outdir, "scores_all.txt"
|
72
72
|
mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt"
|
73
73
|
|
74
|
-
virus_recs, host_recs = [], []
|
74
|
+
# virus_recs, host_recs = [], []
|
75
75
|
|
76
76
|
# Tempfile.open do |vir_f|
|
77
77
|
# Tempfile.open do |host_f|
|
@@ -159,38 +159,18 @@ name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmp
|
|
159
159
|
wish_outf = BigSimon::Runners.wish BigSimon::WISH, tmpdir_virus, tmpdir_host, tmpdir, threads
|
160
160
|
vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpdir_host, tmpdir
|
161
161
|
|
162
|
+
# TODO separate the parser from the runner for mummer.
|
162
163
|
host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads
|
163
164
|
|
164
|
-
# Map them back to simple names. TODO just have it spit these out from the beginning.
|
165
|
-
host_info_mummer_simple_names = {}
|
166
|
-
inverted_name_map_virus = name_map_virus.invert
|
167
|
-
inverted_name_map_host = name_map_host.invert
|
168
|
-
|
169
|
-
host_info_mummer.each do |virus, host_tables|
|
170
|
-
virname = virus
|
171
|
-
if inverted_name_map_virus.has_key? virus
|
172
|
-
virname = inverted_name_map_virus[virus]
|
173
|
-
end
|
174
|
-
|
175
|
-
host_info_mummer_simple_names[virname] = []
|
176
|
-
|
177
|
-
host_tables.map do |table|
|
178
|
-
hostname = inverted_name_map_host.has_key?(table[:host]) ? inverted_name_map_host[table[:host]] : table[:host]
|
179
|
-
new_table = { host: hostname, score: table[:score], scaled_score: table[:scaled_score] }
|
180
|
-
|
181
|
-
host_info_mummer_simple_names[virname] << new_table
|
182
|
-
end
|
183
|
-
end
|
184
|
-
|
185
165
|
host_info_wish = BigSimon::Parsers.wish wish_outf
|
186
166
|
host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf
|
187
167
|
|
188
|
-
host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer_simple_names], programs
|
168
|
+
host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer], programs
|
189
169
|
host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host
|
190
170
|
|
191
|
-
|
192
|
-
|
193
|
-
|
171
|
+
puts
|
172
|
+
pp host_info
|
173
|
+
puts
|
194
174
|
|
195
175
|
# Just a basic all info file
|
196
176
|
# File.open all_predictions_fname, "w" do |f|
|
data/exe/{ranks → consensus_predictions}
CHANGED
@@ -3,10 +3,15 @@ Signal.trap("PIPE", "EXIT")
|
|
3
3
|
|
4
4
|
require "pp"
|
5
5
|
|
6
|
+
require "big_simon"
|
7
|
+
|
8
|
+
Rya::AbortIf.abort_unless ARGV.count >= 1, "usage: TOP=number consensus_predictions scores_scaled.*.txt > consensus_predictions.txt"
|
9
|
+
|
6
10
|
by_program = {}
|
7
11
|
lines = []
|
8
12
|
|
9
|
-
TOP = (ENV["TOP"] ||
|
13
|
+
TOP = (ENV["TOP"] || 3).to_i
|
14
|
+
AT_LEAST = (ENV["AT_LEAST"] || 2).to_i
|
10
15
|
|
11
16
|
ARGV.each do |fname|
|
12
17
|
scores = {}
|
@@ -78,16 +83,18 @@ by_program.each do |virus, program_tables|
|
|
78
83
|
end
|
79
84
|
end
|
80
85
|
|
81
|
-
first_table.each do |virus, host_counts|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
end
|
86
|
+
# first_table.each do |virus, host_counts|
|
87
|
+
# host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
|
88
|
+
# STDERR.puts [virus, :best, host, programs.count, programs].join "\t"
|
89
|
+
# end
|
90
|
+
# STDERR.puts
|
91
|
+
# end
|
87
92
|
|
88
93
|
top_host_table.each do |virus, host_counts|
|
94
|
+
puts "\n\n>Virus-------#{virus}"
|
89
95
|
host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
|
96
|
+
if programs.count >= AT_LEAST
|
90
97
|
puts [virus, :top_N, host, programs.count, programs].join "\t"
|
98
|
+
end
|
91
99
|
end
|
92
|
-
puts
|
93
100
|
end
|
data/lib/big_simon/runners.rb
CHANGED
@@ -1,11 +1,118 @@
|
|
1
1
|
require "tempfile"
|
2
|
+
require "parse_fasta"
|
2
3
|
|
3
4
|
module BigSimon
|
4
5
|
class Runners
|
5
6
|
|
7
|
+
# @note To match the other things, you'd like them to be key'd on the file name.
|
8
|
+
def self.mummer exe, vir_dir, host_dir, outdir, threads
|
9
|
+
klass = Class.new.extend Rya::CoreExtensions::Math
|
10
|
+
FileUtils.mkdir_p outdir
|
11
|
+
|
12
|
+
mummer_outfname = File.join outdir, "mummer_out.txt"
|
13
|
+
|
14
|
+
virus_fnames = Dir.glob(vir_dir + "/*")
|
15
|
+
host_fnames = Dir.glob(host_dir + "/*")
|
16
|
+
|
17
|
+
hit_table = {}
|
18
|
+
|
19
|
+
Tempfile.open do |vir_f|
|
20
|
+
Tempfile.open do |host_f|
|
21
|
+
virus_fnames.each do |fname|
|
22
|
+
Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"
|
23
|
+
|
24
|
+
Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
|
25
|
+
# id needs to be the file name
|
26
|
+
new_id = File.basename fname.sub(/.fa$/, "")
|
27
|
+
|
28
|
+
hit_table[new_id] = {}
|
29
|
+
|
30
|
+
vir_f.puts ">#{new_id}\n#{rec.seq}"
|
31
|
+
|
32
|
+
vir_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
host_fnames.each do |fname|
|
37
|
+
Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"
|
38
|
+
|
39
|
+
Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
|
40
|
+
new_id = File.basename fname.sub(/.fa$/, "")
|
41
|
+
|
42
|
+
# Add this host to each virus in the hit_table
|
43
|
+
hit_table.each do |virus, host_table|
|
44
|
+
host_table[new_id] = 0 # set it to defualt score of 0
|
45
|
+
end
|
46
|
+
|
47
|
+
host_f.puts ">#{new_id}\n#{rec.seq}"
|
48
|
+
host_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
vir_f.fsync
|
53
|
+
host_f.fsync
|
54
|
+
|
55
|
+
cmd = "mummer -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
|
56
|
+
Process.run_and_time_it! "MUMMER", cmd
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
virus = nil
|
61
|
+
overall_max_score = 0
|
62
|
+
File.open(mummer_outfname, "rt").each_line.with_index do |line, idx|
|
63
|
+
line.chomp!
|
64
|
+
|
65
|
+
unless line.empty?
|
66
|
+
if line.start_with? ">"
|
67
|
+
virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
|
68
|
+
|
69
|
+
# It can be duplicated as there are forward and reverse for each sequence (in case they're contigs.)
|
70
|
+
|
71
|
+
Rya::AbortIf.assert hit_table.has_key?(virus)
|
72
|
+
# unless hit_table.has_key? virus
|
73
|
+
# hit_table[virus] = {}
|
74
|
+
# end
|
75
|
+
else
|
76
|
+
ary = line.strip.split " "
|
77
|
+
|
78
|
+
host = ary[0].sub(/___reverse$/, "").strip
|
79
|
+
score = ary[3].to_i
|
80
|
+
|
81
|
+
Rya::AbortIf.assert hit_table[virus].has_key?(host)
|
82
|
+
|
83
|
+
# unless hit_table[virus].has_key? host
|
84
|
+
# hit_table[virus][host] = -1
|
85
|
+
# end
|
86
|
+
|
87
|
+
# We only want the longest hit.
|
88
|
+
hit_table[virus][host] = score if score > hit_table[virus][host]
|
89
|
+
|
90
|
+
# Track the overall max for scaling.
|
91
|
+
overall_max_score = score if score > overall_max_score
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
results_table = {}
|
97
|
+
|
98
|
+
min, max, from, to = 0, overall_max_score, 1, 0
|
99
|
+
|
100
|
+
hit_table.each do |virus, host_table|
|
101
|
+
results_table[virus] = []
|
102
|
+
|
103
|
+
host_table.each do |host, score|
|
104
|
+
scaled_score = klass.scale score, min, max, from, to
|
105
|
+
|
106
|
+
results_table[virus] << { host: host, score: score, scaled_score: scaled_score }
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
results_table
|
111
|
+
end
|
112
|
+
|
6
113
|
# This one's a bit different as it parses as well and returns original names.
|
7
114
|
# @todo Also do the reverse of each genome in case it's a contig.
|
8
|
-
def self.mummer exe, vir_dir, host_dir, outdir, threads
|
115
|
+
def self.mummer2 exe, vir_dir, host_dir, outdir, threads
|
9
116
|
klass = Class.new.extend Rya::CoreExtensions::Math
|
10
117
|
FileUtils.mkdir_p outdir
|
11
118
|
|
data/lib/big_simon/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: big_simon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
@@ -111,7 +111,7 @@ email:
|
|
111
111
|
- moorer@udel.edu
|
112
112
|
executables:
|
113
113
|
- big_simon
|
114
|
-
- ranks
|
114
|
+
- consensus_predictions
|
115
115
|
extensions: []
|
116
116
|
extra_rdoc_files: []
|
117
117
|
files:
|
@@ -129,7 +129,7 @@ files:
|
|
129
129
|
- bin/console
|
130
130
|
- bin/setup
|
131
131
|
- exe/big_simon
|
132
|
-
- exe/ranks
|
132
|
+
- exe/consensus_predictions
|
133
133
|
- lib/big_simon.rb
|
134
134
|
- lib/big_simon/parsers.rb
|
135
135
|
- lib/big_simon/pipeline.rb
|