big_simon 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/Makefile +2 -2
- data/exe/big_simon +6 -26
- data/exe/{ranks → consensus_predictions} +15 -8
- data/lib/big_simon/runners.rb +108 -1
- data/lib/big_simon/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9c81d1057b304f170cdf1dd6a7551bd981efc232
|
4
|
+
data.tar.gz: c0550efe308bc0080c9f14ce65057b100ff2f32b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 652abc1ab4507ed51012cde1a541a8e5ce75e30a2c2d8b9ef4cb0363e49bb8ea4f088dcb4786bb5f9f466f8f0ed0d2f2e6d9b446aa3a4c8fd2a6a8f4370f6a78
|
7
|
+
data.tar.gz: 3a82d66018a1dea01afb9f0230a9f8ad6ac90d96d5862a79b57ecb03f954eedf9183c2dc9d03b6f98a6a0675b3293577c4e5001fa7f792bbe68ebefe397d47f8
|
data/Gemfile.lock
CHANGED
data/Makefile
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
test_small:
|
2
|
-
rm -r 0000TEST/; exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
2
|
+
rm -r 0000TEST/; time exe/big_simon -t 3 -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
3
3
|
|
4
4
|
test_small_install:
|
5
|
-
rm -r 0000TEST/; rake install && exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
5
|
+
rm -r 0000TEST/; rake install && time exe/big_simon -t 3 -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
|
6
6
|
|
7
7
|
test_toy:
|
8
8
|
rm -r toyexample_out; time exe/big_simon -v vendor/repos/VirHostMatcher/test/toyexample/virus/* -h vendor/repos/VirHostMatcher/test/toyexample/host/* -o toyexample_out -t 3
|
data/exe/big_simon
CHANGED
@@ -71,7 +71,7 @@ tmpdir_host = File.join tmpdir, "host"
|
|
71
71
|
# all_predictions_fname = File.join outdir, "scores_all.txt"
|
72
72
|
mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt"
|
73
73
|
|
74
|
-
virus_recs, host_recs = [], []
|
74
|
+
# virus_recs, host_recs = [], []
|
75
75
|
|
76
76
|
# Tempfile.open do |vir_f|
|
77
77
|
# Tempfile.open do |host_f|
|
@@ -159,38 +159,18 @@ name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmp
|
|
159
159
|
wish_outf = BigSimon::Runners.wish BigSimon::WISH, tmpdir_virus, tmpdir_host, tmpdir, threads
|
160
160
|
vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpdir_host, tmpdir
|
161
161
|
|
162
|
+
# TODO separate the parser from the runner for mummer.
|
162
163
|
host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads
|
163
164
|
|
164
|
-
# Map them back to simple names. TODO just have it spit these out from the beginning.
|
165
|
-
host_info_mummer_simple_names = {}
|
166
|
-
inverted_name_map_virus = name_map_virus.invert
|
167
|
-
inverted_name_map_host = name_map_host.invert
|
168
|
-
|
169
|
-
host_info_mummer.each do |virus, host_tables|
|
170
|
-
virname = virus
|
171
|
-
if inverted_name_map_virus.has_key? virus
|
172
|
-
virname = inverted_name_map_virus[virus]
|
173
|
-
end
|
174
|
-
|
175
|
-
host_info_mummer_simple_names[virname] = []
|
176
|
-
|
177
|
-
host_tables.map do |table|
|
178
|
-
hostname = inverted_name_map_host.has_key?(table[:host]) ? inverted_name_map_host[table[:host]] : table[:host]
|
179
|
-
new_table = { host: hostname, score: table[:score], scaled_score: table[:scaled_score] }
|
180
|
-
|
181
|
-
host_info_mummer_simple_names[virname] << new_table
|
182
|
-
end
|
183
|
-
end
|
184
|
-
|
185
165
|
host_info_wish = BigSimon::Parsers.wish wish_outf
|
186
166
|
host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf
|
187
167
|
|
188
|
-
host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm,
|
168
|
+
host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer], programs
|
189
169
|
host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host
|
190
170
|
|
191
|
-
|
192
|
-
|
193
|
-
|
171
|
+
puts
|
172
|
+
pp host_info
|
173
|
+
puts
|
194
174
|
|
195
175
|
# Just a basic all info file
|
196
176
|
# File.open all_predictions_fname, "w" do |f|
|
@@ -3,10 +3,15 @@ Signal.trap("PIPE", "EXIT")
|
|
3
3
|
|
4
4
|
require "pp"
|
5
5
|
|
6
|
+
require "big_simon"
|
7
|
+
|
8
|
+
Rya::AbortIf.abort_unless ARGV.count >= 1, "usage: TOP=number consensus_predictions scores_scaled.*.txt > consensus_predictions.txt"
|
9
|
+
|
6
10
|
by_program = {}
|
7
11
|
lines = []
|
8
12
|
|
9
|
-
TOP = (ENV["TOP"] ||
|
13
|
+
TOP = (ENV["TOP"] || 3).to_i
|
14
|
+
AT_LEAST = (ENV["AT_LEAST"] || 2).to_i
|
10
15
|
|
11
16
|
ARGV.each do |fname|
|
12
17
|
scores = {}
|
@@ -78,16 +83,18 @@ by_program.each do |virus, program_tables|
|
|
78
83
|
end
|
79
84
|
end
|
80
85
|
|
81
|
-
first_table.each do |virus, host_counts|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
end
|
86
|
+
# first_table.each do |virus, host_counts|
|
87
|
+
# host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
|
88
|
+
# STDERR.puts [virus, :best, host, programs.count, programs].join "\t"
|
89
|
+
# end
|
90
|
+
# STDERR.puts
|
91
|
+
# end
|
87
92
|
|
88
93
|
top_host_table.each do |virus, host_counts|
|
94
|
+
puts "\n\n>Virus-------#{virus}"
|
89
95
|
host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
|
96
|
+
if programs.count >= AT_LEAST
|
90
97
|
puts [virus, :top_N, host, programs.count, programs].join "\t"
|
98
|
+
end
|
91
99
|
end
|
92
|
-
puts
|
93
100
|
end
|
data/lib/big_simon/runners.rb
CHANGED
@@ -1,11 +1,118 @@
|
|
1
1
|
require "tempfile"
|
2
|
+
require "parse_fasta"
|
2
3
|
|
3
4
|
module BigSimon
|
4
5
|
class Runners
|
5
6
|
|
7
|
+
# @note To match the other things, you'd like them to be keyed on the file name.
|
8
|
+
def self.mummer exe, vir_dir, host_dir, outdir, threads
|
9
|
+
klass = Class.new.extend Rya::CoreExtensions::Math
|
10
|
+
FileUtils.mkdir_p outdir
|
11
|
+
|
12
|
+
mummer_outfname = File.join outdir, "mummer_out.txt"
|
13
|
+
|
14
|
+
virus_fnames = Dir.glob(vir_dir + "/*")
|
15
|
+
host_fnames = Dir.glob(host_dir + "/*")
|
16
|
+
|
17
|
+
hit_table = {}
|
18
|
+
|
19
|
+
Tempfile.open do |vir_f|
|
20
|
+
Tempfile.open do |host_f|
|
21
|
+
virus_fnames.each do |fname|
|
22
|
+
Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"
|
23
|
+
|
24
|
+
Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
|
25
|
+
# id needs to be the file name
|
26
|
+
new_id = File.basename fname.sub(/.fa$/, "")
|
27
|
+
|
28
|
+
hit_table[new_id] = {}
|
29
|
+
|
30
|
+
vir_f.puts ">#{new_id}\n#{rec.seq}"
|
31
|
+
|
32
|
+
vir_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
host_fnames.each do |fname|
|
37
|
+
Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"
|
38
|
+
|
39
|
+
Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
|
40
|
+
new_id = File.basename fname.sub(/.fa$/, "")
|
41
|
+
|
42
|
+
# Add this host to each virus in the hit_table
|
43
|
+
hit_table.each do |virus, host_table|
|
44
|
+
host_table[new_id] = 0 # set it to default score of 0
|
45
|
+
end
|
46
|
+
|
47
|
+
host_f.puts ">#{new_id}\n#{rec.seq}"
|
48
|
+
host_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
vir_f.fsync
|
53
|
+
host_f.fsync
|
54
|
+
|
55
|
+
cmd = "mummer -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
|
56
|
+
Process.run_and_time_it! "MUMMER", cmd
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
virus = nil
|
61
|
+
overall_max_score = 0
|
62
|
+
File.open(mummer_outfname, "rt").each_line.with_index do |line, idx|
|
63
|
+
line.chomp!
|
64
|
+
|
65
|
+
unless line.empty?
|
66
|
+
if line.start_with? ">"
|
67
|
+
virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
|
68
|
+
|
69
|
+
# It can be duplicated, as there are forward and reverse entries for each sequence (in case they're contigs).
|
70
|
+
|
71
|
+
Rya::AbortIf.assert hit_table.has_key?(virus)
|
72
|
+
# unless hit_table.has_key? virus
|
73
|
+
# hit_table[virus] = {}
|
74
|
+
# end
|
75
|
+
else
|
76
|
+
ary = line.strip.split " "
|
77
|
+
|
78
|
+
host = ary[0].sub(/___reverse$/, "").strip
|
79
|
+
score = ary[3].to_i
|
80
|
+
|
81
|
+
Rya::AbortIf.assert hit_table[virus].has_key?(host)
|
82
|
+
|
83
|
+
# unless hit_table[virus].has_key? host
|
84
|
+
# hit_table[virus][host] = -1
|
85
|
+
# end
|
86
|
+
|
87
|
+
# We only want the longest hit.
|
88
|
+
hit_table[virus][host] = score if score > hit_table[virus][host]
|
89
|
+
|
90
|
+
# Track the overall max for scaling.
|
91
|
+
overall_max_score = score if score > overall_max_score
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
results_table = {}
|
97
|
+
|
98
|
+
min, max, from, to = 0, overall_max_score, 1, 0
|
99
|
+
|
100
|
+
hit_table.each do |virus, host_table|
|
101
|
+
results_table[virus] = []
|
102
|
+
|
103
|
+
host_table.each do |host, score|
|
104
|
+
scaled_score = klass.scale score, min, max, from, to
|
105
|
+
|
106
|
+
results_table[virus] << { host: host, score: score, scaled_score: scaled_score }
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
results_table
|
111
|
+
end
|
112
|
+
|
6
113
|
# This one's a bit different as it parses as well and returns original names.
|
7
114
|
# @todo Also do the reverse of each genome in case it's a contig.
|
8
|
-
def self.
|
115
|
+
def self.mummer2 exe, vir_dir, host_dir, outdir, threads
|
9
116
|
klass = Class.new.extend Rya::CoreExtensions::Math
|
10
117
|
FileUtils.mkdir_p outdir
|
11
118
|
|
data/lib/big_simon/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: big_simon
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
@@ -111,7 +111,7 @@ email:
|
|
111
111
|
- moorer@udel.edu
|
112
112
|
executables:
|
113
113
|
- big_simon
|
114
|
-
-
|
114
|
+
- consensus_predictions
|
115
115
|
extensions: []
|
116
116
|
extra_rdoc_files: []
|
117
117
|
files:
|
@@ -129,7 +129,7 @@ files:
|
|
129
129
|
- bin/console
|
130
130
|
- bin/setup
|
131
131
|
- exe/big_simon
|
132
|
-
- exe/
|
132
|
+
- exe/consensus_predictions
|
133
133
|
- lib/big_simon.rb
|
134
134
|
- lib/big_simon/parsers.rb
|
135
135
|
- lib/big_simon/pipeline.rb
|