big_simon 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 544705d81f538f636b61b014d4fa84d0a0b68918
4
- data.tar.gz: e8943460d42bca98943fc27ae7919601cefcbe1a
3
+ metadata.gz: dfbea3a58014cbec45a3959ad076fd842c6b393c
4
+ data.tar.gz: 69d9f8ac1dea196f64dd13fb06a13a4c26b38e45
5
5
  SHA512:
6
- metadata.gz: 8e0c2f5be8f1dbcdaa0149788db1fe05c2cdc7e5c0988d20311f8cf4a37aa3c0869d271704f606a00cab7d7bfb03e81a07f89bce52b92b933495fc62efdd26ac
7
- data.tar.gz: e50a57bb282bb20274cb49673b657574b277d61af80c0a510caa6938fc5699eb368d3bcc133d9759d53dab1f6f57c3bd11c9c537227444bf6b6f7602b6f59bb4
6
+ metadata.gz: 3a54fe903bb5c0f2f574a389dfc945ffe3d93bd2b1a15361bda81430542c2ef3990ac31e67b671690e056e23544e4f30f21dbecd756050dfb98dfb924fd15ca3
7
+ data.tar.gz: b61f6ba10b7efc267419ccccaef82654662923629e0b2b61e5a3c5e74b79fe8b753dd160c3e3d7fd57b1e926b54684f9367a839075544c496ff3a839ece57fb5
data/.gitignore CHANGED
@@ -18,4 +18,16 @@
18
18
 
19
19
  vendor/repos
20
20
 
21
- test_files/test_output
21
+ test_files/test_output
22
+
23
+ 0000TEST
24
+
25
+ apple.txt
26
+ stats.r
27
+ toyexample_out/
28
+
29
+ spec/test_files/outdir_for_heatmaps/outdir/
30
+
31
+ stats.tmp.r
32
+
33
+ scratch
data/Gemfile.lock CHANGED
@@ -1,14 +1,17 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- big_simon (0.0.1)
5
- rya (~> 0.1.3)
4
+ big_simon (0.1.0)
5
+ parse_fasta (~> 2.5, >= 2.5.2)
6
+ rya (~> 0.4.0)
7
+ trollop (~> 2.1, >= 2.1.3)
6
8
 
7
9
  GEM
8
10
  remote: https://rubygems.org/
9
11
  specs:
10
12
  abort_if (0.2.0)
11
13
  diff-lcs (1.3)
14
+ parse_fasta (2.5.2)
12
15
  rake (10.5.0)
13
16
  rspec (3.7.0)
14
17
  rspec-core (~> 3.7.0)
@@ -23,10 +26,11 @@ GEM
23
26
  diff-lcs (>= 1.2.0, < 2.0)
24
27
  rspec-support (~> 3.7.0)
25
28
  rspec-support (3.7.1)
26
- rya (0.1.3)
29
+ rya (0.4.0)
27
30
  abort_if (~> 0.2.0)
28
31
  systemu (~> 2.6, >= 2.6.5)
29
32
  systemu (2.6.5)
33
+ trollop (2.1.3)
30
34
 
31
35
  PLATFORMS
32
36
  ruby
data/Makefile ADDED
@@ -0,0 +1,8 @@
1
+ test_small:
2
+ rm -r 0000TEST/; exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
3
+
4
+ test_small_install:
5
+ rm -r 0000TEST/; rake install && exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
6
+
7
+ test_toy:
8
+ rm -r toyexample_out; time exe/big_simon -v vendor/repos/VirHostMatcher/test/toyexample/virus/* -h vendor/repos/VirHostMatcher/test/toyexample/host/* -o toyexample_out -t 3
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # BigSimon
2
2
 
3
- A pipeline for finding hosts of viruses!
3
+ Hi, I'm BigSimon (but you can call me BigSi if you want), and I'm a pipeline for finding hosts of viruses!
4
+
5
+ Mainly, I'm just a wrapper for some other nice tools.
4
6
 
5
7
  ## Installation
6
8
 
@@ -20,7 +22,11 @@ Or install it yourself as:
20
22
 
21
23
  ## Usage
22
24
 
23
- TODO
25
+ For now, there is not much documentation. To see the help message, run:
26
+
27
+ ```
28
+ $ big_simon --help
29
+ ```
24
30
 
25
31
  ## Development
26
32
 
data/big_simon.gemspec CHANGED
@@ -26,5 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency "rake", "~> 10.0"
27
27
  spec.add_development_dependency "rspec", "~> 3.0"
28
28
 
29
- spec.add_runtime_dependency "rya", "~> 0.1.3"
29
+ spec.add_runtime_dependency "parse_fasta", "~> 2.5", ">= 2.5.2"
30
+ spec.add_runtime_dependency "rya", "~> 0.4.0"
31
+ spec.add_runtime_dependency "trollop", "~> 2.1", ">= 2.1.3"
30
32
  end
data/exe/big_simon ADDED
@@ -0,0 +1,244 @@
1
+ #!/usr/bin/env ruby
2
+ Signal.trap("PIPE", "EXIT")
3
+
4
+ require "pp"
5
+ require "tempfile"
6
+
7
+ require "parse_fasta"
8
+ require "trollop"
9
+
10
+ require "big_simon"
11
+
12
+ # TODO make scaled scores with high score being better.
13
+
14
+ Process.extend Rya::CoreExtensions::Process
15
+
16
+ opts = Trollop.options do
17
+ version BigSimon::VERSION_BANNER
18
+
19
+ banner <<-EOS
20
+
21
+ #{BigSimon::VERSION_BANNER}
22
+
23
+ Hi, I'm BigSimon! I'm here to help you figure out the hosts for
24
+ your viruses.
25
+
26
+ I run a bunch of different programs. In addition to doing some
27
+ merging of results, I'll give you heatmaps for all the programs and
28
+ you can check for yourself.
29
+
30
+ The scaled scores run from 0 to 1 with lower scores being better.
31
+
32
+ Options:
33
+ EOS
34
+
35
+ opt :viruses, "Path to fasta file(s) with viruses", type: :strings
36
+ opt :hosts, "Path to fasta file(s) with hosts", type: :strings
37
+ opt :outdir, "Output directory", default: "big_simon"
38
+ opt :threads, "Number of threads to use", default: 1
39
+ end
40
+
41
+ Rya::AbortIf.logger.debug { "Command line opts: #{opts.inspect}" }
42
+
43
+ BigSimon::Utils.check_opt! opts, :viruses
44
+ BigSimon::Utils.check_opt! opts, :hosts
45
+
46
+ # Check infiles
47
+ [opts[:viruses], opts[:hosts]].flatten.each do |fname|
48
+ BigSimon::Utils.check_file! fname
49
+ end
50
+
51
+ Rya::AbortIf.abort_unless opts[:threads] > 0,
52
+ "--threads must be > 0"
53
+
54
+ programs = [
55
+ "WIsH",
56
+ "VirHostMatcher",
57
+ "mummer"
58
+ ]
59
+
60
+ outdir = opts[:outdir]
61
+ threads = opts[:threads]
62
+ virus_fnames = opts[:viruses]
63
+ host_fnames = opts[:hosts]
64
+
65
+ FileUtils.mkdir_p outdir
66
+
67
+ tmpdir = File.join opts[:outdir], "big_simon_tmp"
68
+ tmpdir_virus = File.join tmpdir, "virus"
69
+ tmpdir_host = File.join tmpdir, "host"
70
+
71
+ # all_predictions_fname = File.join outdir, "scores_all.txt"
72
+ mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt"
73
+
74
+ virus_recs, host_recs = [], []
75
+
76
+ # Tempfile.open do |vir_f|
77
+ # Tempfile.open do |host_f|
78
+ # virus_fnames.each do |fname|
79
+ # ParseFasta::SeqFile.open(fname).each_record do |rec|
80
+ # vir_f.puts rec
81
+ #
82
+ # vir_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
83
+ # end
84
+ # end
85
+ #
86
+ # host_fnames.each do |fname|
87
+ # ParseFasta::SeqFile.open(fname).each_record do |rec|
88
+ # host_f.puts rec
89
+ # host_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
90
+ # end
91
+ # end
92
+ #
93
+ # vir_f.fsync
94
+ # host_f.fsync
95
+ #
96
+ # cmd = "mummer -maxmatch -l 15 #{host_f.path} #{vir_f.path} > /Users/moorer/Desktop/mummer.OUT"
97
+ # Process.run_and_time_it! "MUMMER", cmd
98
+ # end
99
+ # end
100
+ #
101
+ # header = nil
102
+ # hits = []
103
+ # hit_info = {}
104
+ # virus = nil
105
+ #
106
+ # File.open("/Users/moorer/Desktop/mummer.OUT", "rt").each_line.with_index do |line, idx|
107
+ # if line.start_with? '>'
108
+ # virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
109
+ #
110
+ # unless hit_info.has_key? virus
111
+ # hit_info[virus] = {}
112
+ # end
113
+ # else
114
+ # host, _, _, len = line.chomp.strip.split(" ")
115
+ # host = host.sub(/___reverse$/, "").strip
116
+ #
117
+ # unless hit_info[virus].has_key? host
118
+ # hit_info[virus][host] = -1
119
+ # end
120
+ #
121
+ # hit_info[virus][host] = len.to_i if len.to_i > hit_info[virus][host]
122
+ # end
123
+ # end
124
+ #
125
+ # puts
126
+ #
127
+ # hh = hit_info.map do |virus, info|
128
+ # [virus, info.to_a.sort_by {|gen, len| len}.reverse]
129
+ # end
130
+ #
131
+ # pp hh
132
+
133
+ # hh = hit_info.map do |virus, info|
134
+ # [virus, info.to_a.sort_by { |host, hit_len| hit_len }.reverse
135
+ #
136
+ # end
137
+ # p hit_info
138
+
139
+ scores_files = {}
140
+ programs.each do |program|
141
+ raw_fname = File.join outdir, "scores_raw.#{program}.txt"
142
+ scaled_fname = File.join outdir, "scores_scaled.#{program}.txt"
143
+
144
+ scores_files[program] = {
145
+ raw: File.open(raw_fname, "w"),
146
+ scaled: File.open(scaled_fname, "w")
147
+ }
148
+ end
149
+
150
+ scores_files.each do |program, files|
151
+ files.each do |name, file|
152
+ file.puts %w[virus host score].join "\t"
153
+ end
154
+ end
155
+
156
+ name_map_virus, all_ids_virus = BigSimon::Utils.set_up_tmp_dirs virus_fnames, tmpdir_virus, "virus"
157
+ name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmpdir_host, "host"
158
+
159
+ wish_outf = BigSimon::Runners.wish BigSimon::WISH, tmpdir_virus, tmpdir_host, tmpdir, threads
160
+ vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpdir_host, tmpdir
161
+
162
+ host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads
163
+
164
+ # Map them back to simple names. TODO just have it spit these out from the beginning.
165
+ host_info_mummer_simple_names = {}
166
+ inverted_name_map_virus = name_map_virus.invert
167
+ inverted_name_map_host = name_map_host.invert
168
+
169
+ host_info_mummer.each do |virus, host_tables|
170
+ virname = virus
171
+ if inverted_name_map_virus.has_key? virus
172
+ virname = inverted_name_map_virus[virus]
173
+ end
174
+
175
+ host_info_mummer_simple_names[virname] = []
176
+
177
+ host_tables.map do |table|
178
+ hostname = inverted_name_map_host.has_key?(table[:host]) ? inverted_name_map_host[table[:host]] : table[:host]
179
+ new_table = { host: hostname, score: table[:score], scaled_score: table[:scaled_score] }
180
+
181
+ host_info_mummer_simple_names[virname] << new_table
182
+ end
183
+ end
184
+
185
+ host_info_wish = BigSimon::Parsers.wish wish_outf
186
+ host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf
187
+
188
+ host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer_simple_names], programs
189
+ host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host
190
+
191
+ # puts
192
+ # pp host_info
193
+ # puts
194
+
195
+ # Just a basic all info file
196
+ # File.open all_predictions_fname, "w" do |f|
197
+ # f.puts %w[virus host program score scaled.score].join "\t"
198
+
199
+ host_info.each do |virus, h1|
200
+ h1.each do |host, h2|
201
+ lines = {}
202
+
203
+ h2[:scores].each do |program, score|
204
+ lines[[virus, host, program]] = [score]
205
+
206
+ scores_files[program][:raw].puts [virus, host, score].join "\t"
207
+ end
208
+
209
+ # Add in the scaled score too.
210
+ h2[:scaled_scores].each do |program, score|
211
+ lines[[virus, host, program]] << score
212
+
213
+ scores_files[program][:scaled].puts [virus, host, score].join "\t"
214
+ end
215
+
216
+ # lines.each do |(virus, host, program), (score, scaled_score)|
217
+ # f.puts [virus, host, program, score, scaled_score].join "\t"
218
+ # end
219
+ end
220
+ end
221
+ # end
222
+
223
+ # A file with mean scaled scores.
224
+ File.open mean_scaled_scores_fname, "w" do |f|
225
+ f.puts %w[virus host score].join "\t"
226
+
227
+ host_info.each do |virus, h1|
228
+ h1.each do |host, h2|
229
+ scaled_scores = h2[:scaled_scores].values
230
+
231
+ mean_scaled_score = scaled_scores.reduce(:+) / scaled_scores.length.to_f
232
+
233
+ f.puts [virus, host, mean_scaled_score].join "\t"
234
+ end
235
+ end
236
+ end
237
+
238
+ scores_files.each do |program, file|
239
+ file.values.map(&:close)
240
+ end
241
+ FileUtils.rm_r tmpdir
242
+
243
+ # Make the heatmaps
244
+ BigSimon::Runners.heatmaps BigSimon::RSCRIPT, outdir, File.join(outdir, "heatmaps")
data/exe/ranks ADDED
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env ruby
2
+ Signal.trap("PIPE", "EXIT")
3
+
4
+ require "pp"
5
+
6
+ by_program = {}
7
+ lines = []
8
+
9
+ TOP = (ENV["TOP"] || 5).to_i
10
+
11
+ ARGV.each do |fname|
12
+ scores = {}
13
+ File.open(fname, "rt").each_line.with_index do |line, idx|
14
+ unless idx.zero?
15
+ virus, host, score = line.chomp.split "\t"
16
+
17
+ unless scores.has_key? virus
18
+ scores[virus] = []
19
+ end
20
+
21
+ scores[virus] << [host, score.to_f]
22
+ end
23
+ end
24
+
25
+ scores.sort_by { |virus, _| virus }.each do |virus, host_scores|
26
+ # Lowest score is the best
27
+ top_5 = host_scores.sort_by { |host, score| score }.take(TOP).map(&:first)
28
+
29
+ line = [File.basename(fname), virus, top_5]
30
+ lines << line
31
+
32
+ # puts line.join "\t"
33
+ end
34
+ # puts
35
+ end
36
+
37
+ lines.each do |line|
38
+ program, virus, all = line
39
+ first = all.first
40
+
41
+ unless by_program.has_key? virus
42
+ by_program[virus] = {}
43
+ end
44
+
45
+ unless program == "scores_scaled.mean.txt"
46
+ by_program[virus][program] = { first: first, all: all }
47
+ end
48
+ end
49
+
50
+ # These track the number of times a host shows up in the first spot and in the top N spots for that virus for all programs.
51
+ first_table = {}
52
+ top_host_table = {}
53
+
54
+ by_program.each do |virus, program_tables|
55
+ first_table[virus] = {}
56
+ top_host_table[virus] = {}
57
+
58
+ program_tables.each do |program, top_info|
59
+
60
+
61
+ first_host = top_info[:first]
62
+ all_top = top_info[:all]
63
+
64
+ unless first_table[virus].has_key? first_host
65
+ first_table[virus][first_host] = []
66
+ end
67
+
68
+
69
+ first_table[virus][first_host] << program
70
+
71
+ all_top.each do |top_host|
72
+ unless top_host_table[virus].has_key? top_host
73
+ top_host_table[virus][top_host] = []
74
+ end
75
+
76
+ top_host_table[virus][top_host] << program
77
+ end
78
+ end
79
+ end
80
+
81
+ first_table.each do |virus, host_counts|
82
+ host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
83
+ STDERR.puts [virus, :best, host, programs.count, programs].join "\t"
84
+ end
85
+ STDERR.puts
86
+ end
87
+
88
+ top_host_table.each do |virus, host_counts|
89
+ host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
90
+ puts [virus, :top_N, host, programs.count, programs].join "\t"
91
+ end
92
+ puts
93
+ end
@@ -0,0 +1,78 @@
1
+ module BigSimon
2
+ # Methods for parsing output files
3
+ class Parsers
4
+
5
+ # @note VirHostMatcher returns true distances that run from 0 to 1, so it doesn't need scaling.
6
+ # @note VirHostMatcher includes the whole file name as the id of the organism, so we chop off some common endings.
7
+ def self.vir_host_matcher fname
8
+ hosts = nil
9
+
10
+ host_info = {}
11
+ File.open(fname, "rt").each_line.with_index do |line, idx|
12
+ line.chomp!
13
+ line.sub! /,$/, "" # get rid of trailing commas
14
+
15
+ if idx.zero?
16
+ stat, *hosts = line.split ","
17
+
18
+ hosts.map! { |str| BigSimon::Utils.strip_suffix str }
19
+ else
20
+ ary = line.split ","
21
+ virus = BigSimon::Utils.strip_suffix ary.shift
22
+
23
+ # In this case the best value is the lowest distance.
24
+ dists = ary.map.
25
+ with_index do |dist, idx|
26
+ { host: hosts[idx], score: dist.to_f, scaled_score: dist.to_f }
27
+ end.sort_by { |ht| ht[:scaled_score] }
28
+
29
+
30
+ host_info[virus] = dists
31
+ end
32
+ end
33
+
34
+ host_info
35
+ end
36
+
37
+ # @note WIsH gives log likelihoods, so the scaled value is computed as 1 - exp(log likelihood).
38
+ # @note The viruses and hosts will have the ID rather than the file name.
39
+ def self.wish fname
40
+ viruses = nil
41
+
42
+ host_info = {}
43
+
44
+ hosts = nil
45
+ File.open(fname, "rt").each_line.map.with_index do |line, idx|
46
+ line.chomp!
47
+
48
+ if idx.zero?
49
+ ary = line.split("\t")
50
+ ary.unshift("")
51
+ else
52
+ ary = line.split("\t")
53
+ end
54
+ end.transpose.each_with_index do |line_ary, idx|
55
+ if idx.zero?
56
+ hosts = line_ary.drop(1)
57
+ else
58
+ virus = line_ary.shift
59
+
60
+ scores = line_ary.map(&:to_f)
61
+
62
+ host_vals = scores.map.with_index do |score, idx|
63
+ { host: hosts[idx], score: score, scaled_score: 1 - Math.exp(score) }
64
+ end
65
+
66
+ host_info[virus] = host_vals
67
+ end
68
+
69
+ host_info.each do |virus, hosts|
70
+ hosts.sort_by! { |ht| ht[:scaled_score] }
71
+ end
72
+ end
73
+
74
+ host_info
75
+ end
76
+ end
77
+
78
+ end
@@ -0,0 +1,64 @@
1
+ module BigSimon
2
+ class Pipeline
3
+ # @param collated_results_table { virus => host => score_type => program => score }
4
+ def self.map_taxa collated_results_table, virus_name_map, host_name_map
5
+ new_results_table = {}
6
+
7
+ collated_results_table.each do |virus_name, host_table|
8
+ if virus_name_map.include? virus_name
9
+ new_virus_name = virus_name_map[virus_name]
10
+ else
11
+ new_virus_name = virus_name
12
+ end
13
+
14
+ new_results_table[new_virus_name] = {}
15
+
16
+ host_table.each do |host_name, score_table|
17
+ if host_name_map.include? host_name
18
+ new_host_name = host_name_map[host_name]
19
+ else
20
+ new_host_name = host_name
21
+ end
22
+
23
+ new_results_table[new_virus_name][new_host_name] = score_table
24
+ end
25
+ end
26
+
27
+ new_results_table
28
+ end
29
+
30
+ # @param [Array<Hash>] results_table host info hash tables. See functions in Parsers class.
31
+ # @param [Array<String>] programs names of programs generating hash tables (in same order as results_table)
32
+ def self.collate_host_results results_table, programs
33
+ Rya::AbortIf.assert results_table.count == programs.count
34
+
35
+ virus_host_scores = {}
36
+ all_viruses = results_table.reduce(Set.new) { |acc, ht| acc + ht.keys }
37
+
38
+ all_viruses.each do |virus|
39
+ virus_host_scores[virus] = {}
40
+ end
41
+
42
+ results_table.each_with_index do |ht, idx|
43
+ program = programs[idx]
44
+
45
+ ht.each do |virus, host_scores|
46
+ host_scores.each do |ht|
47
+ host = ht[:host]
48
+ score = ht[:score]
49
+ scaled_score = ht[:scaled_score]
50
+
51
+ unless virus_host_scores[virus].has_key? host
52
+ virus_host_scores[virus][host] = { scores: {}, scaled_scores: {}}
53
+ end
54
+
55
+ virus_host_scores[virus][host][:scores][program] = score
56
+ virus_host_scores[virus][host][:scaled_scores][program] = scaled_score
57
+ end
58
+ end
59
+ end
60
+
61
+ virus_host_scores
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,189 @@
1
+ require "tempfile"
2
+
3
+ module BigSimon
4
+ class Runners
5
+
6
+ # This one's a bit different as it parses as well and returns original names.
7
+ # @todo Also do the reverse of each genome in case it's a contig.
8
+ def self.mummer exe, vir_dir, host_dir, outdir, threads
9
+ klass = Class.new.extend Rya::CoreExtensions::Math
10
+ FileUtils.mkdir_p outdir
11
+
12
+ # TODO put these all in one file then do it?
13
+
14
+ results = {}
15
+
16
+ # Takes names in files and puts them to the file names
17
+ name_map = {}
18
+
19
+ Dir.glob(vir_dir + "/*").each do |vir_fname|
20
+ this_virus_scores = []
21
+ virus = nil
22
+
23
+ Dir.glob(host_dir + "/*").each do |host_fname|
24
+ vir_base = File.basename vir_fname
25
+ host_base = File.basename host_fname
26
+ outfname = File.join outdir, "#{vir_base}___#{host_base}.mummer"
27
+
28
+ # -l is min length of a match TODO pull this into a const
29
+ # -F to force 4 columns
30
+ cmd = "#{exe} -F " \
31
+ "-maxmatch " \
32
+ "-l 15 " \
33
+ "#{host_fname} " \
34
+ "#{vir_fname} " \
35
+ "> #{outfname}"
36
+
37
+ Process.run_and_time_it! "Calculating matches", cmd
38
+
39
+ # Note there should only be one '>' per file here.
40
+ host = nil
41
+ score = 0
42
+ File.open(outfname, "rt").each_line.with_index do |line, idx|
43
+ if idx.zero?
44
+ this_virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
45
+
46
+ Rya::AbortIf::abort_unless(this_virus == virus, "OOPS") if virus
47
+
48
+ virus ||= this_virus
49
+ else
50
+ ary = line.chomp.strip.split(" ")
51
+ Rya::AbortIf.abort_unless ary.count == 4, "Problem parsing #{outfname} (mummer output)"
52
+
53
+ host = ary[0].sub(/___reverse$/, "").strip
54
+ len = ary[3].to_i
55
+
56
+ score = len if len > score
57
+ end
58
+ end
59
+
60
+ this_virus_scores << score
61
+
62
+ unless results.has_key? virus
63
+ results[virus] = []
64
+ end
65
+
66
+ results[virus] << { host: host, score: score, scaled_score: nil }
67
+
68
+ FileUtils.rm outfname
69
+ end
70
+
71
+ # This was the original scaling, i.e. per virus
72
+ # min = 0 # this_virus_scores.min # Technically, this should range from 0 to 15. Any data missing from this table would give a zero. TODO we don't actually account for this though.
73
+ # max = this_virus_scores.max
74
+ # from = 1
75
+ # to = 0
76
+ #
77
+ # results[virus].each do |host_table|
78
+ # host_table[:scaled_score] = klass.scale host_table[:score], min, max, from, to
79
+ # end
80
+ end
81
+
82
+ all_scores = []
83
+ results.each do |virus, host_tables|
84
+ all_scores << host_tables.map { |table| table[:score] }
85
+ end
86
+
87
+ all_scores.flatten!
88
+ max = all_scores.max
89
+
90
+ results.each do |virus, host_tables|
91
+ host_tables.each do |host_table|
92
+ host_table[:scaled_score] = klass.scale host_table[:score], 0, max, 1, 0
93
+ end
94
+ end
95
+
96
+ results
97
+ end
98
+
99
+ def self.vir_host_matcher exe, vir_dir, host_dir, outdir
100
+ FileUtils.mkdir_p outdir
101
+
102
+ cmd = "python #{exe} " \
103
+ "-v #{vir_dir} " \
104
+ "-b #{host_dir} " \
105
+ "-o #{outdir} " \
106
+ "-d 1" # only compute d2star dissimilarity
107
+
108
+ Process.run_and_time_it! "Computing d2star dissimilarity", cmd
109
+
110
+ tmp_dir = File.join outdir, "tmp"
111
+ FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir
112
+
113
+ bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
114
+ bad_files.each do |fname|
115
+ path = File.join outdir, fname
116
+
117
+ FileUtils.rm path if File.exist? path
118
+ end
119
+
120
+ outf = File.join outdir, "d2star_k6.csv"
121
+ new_outf = File.join outdir, "vir_host_matcher.txt"
122
+ FileUtils.mv outf, new_outf
123
+
124
+ new_outf
125
+ end
126
+
127
+ # Runs the WIsH program
128
+ #
129
+ # @raise [AbortIf::Exit] if commands fail
130
+ def self.wish exe, vir_dir, host_dir, outdir, threads
131
+ model_dir = File.join outdir, "model"
132
+
133
+ FileUtils.mkdir_p model_dir
134
+
135
+ build_model = "#{exe} " \
136
+ "-t #{threads} " \
137
+ "-c build " \
138
+ "-g #{host_dir} " \
139
+ "-m #{model_dir}"
140
+
141
+ predict = "#{exe} " \
142
+ "-t #{threads} " \
143
+ "-c predict " \
144
+ "-g #{vir_dir} " \
145
+ "-m #{model_dir} " \
146
+ "-r #{outdir}"
147
+
148
+ Process.run_and_time_it! "Building model", build_model
149
+ Process.run_and_time_it! "Predicting host", predict
150
+
151
+ FileUtils.rm_r model_dir if Dir.exist? model_dir
152
+
153
+ outf = File.join outdir, "llikelihood.matrix"
154
+ new_outf = File.join outdir, "wish.txt"
155
+ FileUtils.mv outf, new_outf
156
+
157
+ new_outf
158
+ end
159
+
160
+ def self.heatmaps exe, indir, outdir
161
+ FileUtils.mkdir_p outdir
162
+
163
+ fnames = Dir.glob("#{indir}/scores*.txt").map do |in_fname|
164
+ extname = File.extname in_fname
165
+ basename = File.basename in_fname, extname
166
+
167
+ out_fname = File.join outdir, "#{basename}.heatmap.pdf"
168
+
169
+ [in_fname, out_fname]
170
+ end
171
+
172
+
173
+ rcode_str = BigSimon::Utils.rcode fnames
174
+
175
+ Object::Tempfile.open do |f|
176
+ f.puts rcode_str
177
+ f.fsync # ensure no data is buffered
178
+
179
+
180
+ cmd = "#{exe} #{f.path}"
181
+ Process.run_and_time_it! "Drawing heatmaps", cmd
182
+ end
183
+
184
+ out_fnames = fnames.map(&:last)
185
+ end
186
+ end
187
+ end
188
+
189
+
@@ -0,0 +1,108 @@
1
+ module BigSimon
2
+ # @todo These don't have unit tests yet.
3
+ # @note set_up_tmp_dirs skips any duplicate sequence IDs and only keeps the first one.
4
+ class Utils
5
+ def self.check_file! fname
6
+ Rya::AbortIf.abort_if fname && !File.exist?(fname),
7
+ "#{fname} doesn't exist! Try big_simon --help for help."
8
+ end
9
+
10
+ def self.check_opt! opts, arg
11
+ Rya::AbortIf.abort_unless opts.send(:fetch, "#{arg}_given".to_sym),
12
+ "You must specify --#{arg.to_s.tr('_', '-')}. Try big_simon --help for help."
13
+ end
14
+
15
+ def self.rcode fnames
16
+ functions = %Q|
17
+ library(reshape2)
18
+ library(gplots)
19
+ library(RColorBrewer)
20
+
21
+ file.join <- function(...) {
22
+ paste(..., sep="/")
23
+ }
24
+
25
+ draw.heatmap <- function(infname, outfname) {
26
+ dat <- read.table(infname, header=T, sep="\t")
27
+
28
+ wide.dat <- dcast(dat, host ~ virus, value.var="score")
29
+
30
+ hosts <- wide.dat[, 1]
31
+ scores <- wide.dat[, 2:ncol(wide.dat)]
32
+ scores.numeric <- apply(scores, 2, as.numeric)
33
+
34
+ scores.matrix <- as.matrix(scores.numeric)
35
+
36
+ rownames(scores.matrix) <- hosts
37
+
38
+ palette <- "YlOrBr"
39
+ col <- colorRampPalette(brewer.pal(n=9, palette))(n = 25)
40
+ size <- 0.75
41
+
42
+ pdf(outfname, height=5, width=8)
43
+
44
+ heatmap.2(scores.matrix,
45
+ trace="none", ## Disable those wonky lines.
46
+ col=col, ## Set the color.
47
+
48
+ ## Size opts
49
+ margins=c(11, 11), cexRow=size, cexCol=size,
50
+
51
+ ## Key labeling
52
+ key.xlab="Score")
53
+
54
+ invisible(dev.off())
55
+ }
56
+
57
+ |
58
+
59
+ drawing = fnames.map do |in_fname, out_fname|
60
+ %Q{
61
+
62
+ draw.heatmap("#{in_fname}", "#{out_fname}")
63
+ }
64
+ end.join
65
+
66
+ [functions, drawing].join "\n"
67
+ end
68
+
69
+ def self.scale_log_likelihood ll
70
+ 1 - Math.exp(ll)
71
+ end
72
+
73
+ def self.set_up_tmp_dirs fastas, tmpdir, which
74
+ Object::FileUtils.mkdir_p tmpdir
75
+
76
+ name_map = {}
77
+ all_ids = Set.new
78
+
79
+ seq_num = -1
80
+ fastas.each do |fname|
81
+ ParseFasta::SeqFile.open(fname).each_record do |rec|
82
+ if all_ids.include? rec.id
83
+ Rya::AbortIf.logger.warn { "#{rec.id} was seen more than one time! Duplicate organism IDs are not allowed, so we will only keep the first one." }
84
+ else
85
+ all_ids << rec.id
86
+
87
+ seq_num += 1
88
+
89
+ new_id = "#{which}_#{seq_num}"
90
+ name_map[new_id] = rec.id
91
+
92
+ outfname = File.join tmpdir, "#{new_id}.fa"
93
+
94
+ File.open(outfname, "w") do |f|
95
+ f.puts rec
96
+ end
97
+ end
98
+ end
99
+ end
100
+
101
+ [name_map, all_ids]
102
+ end
103
+
104
+ def self.strip_suffix fname
105
+ fname.sub /.fasta$|.fa$/, ""
106
+ end
107
+ end
108
+ end
@@ -1,3 +1,15 @@
1
1
  module BigSimon
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
+
4
+ COPYRIGHT = "2018 Ryan Moore"
5
+ CONTACT = "moorer@udel.edu"
6
+ WEBSITE = "https://github.com/mooreryan/InteinFinder"
7
+ LICENSE = "GPLv3"
8
+
9
+ VERSION_BANNER =
10
+ " # Version: v#{VERSION}
11
+ # Copyright: #{COPYRIGHT}
12
+ # Contact: #{CONTACT}
13
+ # License: #{LICENSE}"
14
+
3
15
  end
data/lib/big_simon.rb CHANGED
@@ -1,100 +1,27 @@
1
1
  require "rya"
2
+ require "set"
3
+ require "pp"
2
4
 
3
5
  require "big_simon/version"
4
6
 
7
+ require "big_simon/utils"
8
+
9
+ require "big_simon/runners"
10
+ require "big_simon/parsers"
11
+ require "big_simon/pipeline"
12
+
5
13
  Time.extend Rya::CoreExtensions::Time
6
14
  Process.extend Rya::CoreExtensions::Process
15
+ Array.include Rya::CoreExtensions::Array
16
+ Math.extend Rya::CoreExtensions::Math
7
17
 
8
18
  module BigSimon
9
-
10
- # Project directories
11
19
  ROOT = File.join __dir__, ".."
12
20
  BIN = File.join ROOT, "vendor", "bin", "mac"
13
21
  SPEC = File.join ROOT, "spec"
14
22
  TEST_FILES = File.join SPEC, "test_files"
15
-
16
- class Parsers
17
-
18
- def self.vir_host_matcher fname
19
- hosts = nil
20
-
21
- host_info = {}
22
- File.open(fname, "rt").each_line.with_index do |line, idx|
23
- line.chomp!
24
- line.sub! /,$/, "" # git rid of trailing commas
25
-
26
- if idx.zero?
27
- stat, *hosts = line.split ","
28
- else
29
- ary = line.split ","
30
- virus = ary.shift
31
-
32
- dists = ary.map.
33
- with_index { |dist, idx| [hosts[idx], dist.to_f] }.
34
- sort_by { |_, dist| dist }
35
-
36
- best_host = dists[0][0]
37
-
38
- host_info[virus] = {
39
- best: best_host,
40
- all: dists
41
- }
42
- end
43
- end
44
-
45
- host_info
46
- end
47
- end
48
-
49
- class Runners
50
-
51
- # Runs the WIsH program
52
- #
53
- # @raise [AbortIf::Exit] if commands fail
54
- def self.wish exe, vir_dir, host_dir, outdir, threads
55
- model_dir = File.join outdir, "model"
56
-
57
- FileUtils.mkdir_p model_dir
58
-
59
- build_model = "#{exe} " \
60
- "-t #{threads} " \
61
- "-c build " \
62
- "-g #{host_dir} " \
63
- "-m #{model_dir}"
64
-
65
- predict = "#{exe} " \
66
- "-t #{threads} " \
67
- "-c predict " \
68
- "-g #{vir_dir} " \
69
- "-m #{model_dir} " \
70
- "-r #{outdir} -b"
71
-
72
- Process.run_and_time_it! "Building model", build_model
73
- Process.run_and_time_it! "Predicting host", predict
74
-
75
- FileUtils.rm_r model_dir if Dir.exist? model_dir
76
- end
77
-
78
- def self.vir_host_matcher exe, vir_dir, host_dir, outdir
79
- FileUtils.mkdir_p outdir
80
-
81
- cmd = "python #{exe} " \
82
- "-v #{vir_dir} " \
83
- "-b #{host_dir} " \
84
- "-o #{outdir} " \
85
- "-d 1" # only compute d2star dissimilarity
86
-
87
- Process.run_and_time_it! "Computing d2star dissimilarity", cmd
88
-
89
- tmp_dir = File.join outdir, "tmp"
90
- FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir
91
-
92
- bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
93
- bad_files.each do |fname|
94
- path = File.join outdir, fname
95
-
96
- FileUtils.rm path if File.exist? path
97
- end
98
- end
99
- end
23
+ WISH = File.join BIN, "WIsH"
24
+ VHM = File.join BIN, "vhm.py"
25
+ MUMMER = File.join BIN, "mummer"
26
+ RSCRIPT = "Rscript"
100
27
  end
Binary file
@@ -46,8 +46,9 @@ if not os.path.exists(tmpDir) :
46
46
  filelog = open(os.path.join(tmpDir, 'vhm.log'), 'w')
47
47
 
48
48
  ## name length ##
49
- nameLen = 93 - len(options.outDir)
49
+ # nameLen = 93 - len(options.outDir)
50
50
  #### possibly because of the kmercount folder name for each contig is too long?
51
+ nameLen = 99999999999999
51
52
 
52
53
 
53
54
  #################### 0: preparation ############################
@@ -211,7 +212,7 @@ if options.hostTaxaFile is None :
211
212
  hostTaxaFileWrite = open(hostTaxaFile, 'w') ## make file blank
212
213
  hostTaxaFileWrite.close()
213
214
  hostTaxaFileWrite = open(hostTaxaFile, 'a')
214
-
215
+
215
216
  hostTaxaFileWrite.write("hostNCBIName hostName hostSuperkingdom hostPhylum hostClass hostOrder hostFamily hostGenus hostSpecies\n")
216
217
  for currentFileName in hostFaList :
217
218
  if currentFileName.startswith('.') :
@@ -235,7 +236,7 @@ else :
235
236
  hostTaxaTable = numpy.genfromtxt(options.hostTaxaFile,delimiter="\t", dtype=str)
236
237
  hostTaxaTable[hostTaxaTable=='']='unknown'
237
238
  numpy.savetxt(hostTaxaFile, hostTaxaTable, fmt="%s", delimiter='\t', newline='\n')
238
-
239
+
239
240
  filelog.flush()
240
241
 
241
242
  #################### 1: count kmer and prepare list files ############################
@@ -259,7 +260,7 @@ for currentFileName in virusFaList :
259
260
  filelog.write("Step 1: counting kmers for virus " + currentFileNameS + "\n")
260
261
  for w in range(1, (kmax+1)) :
261
262
  currentFilePath = os.path.join(options.virusFaDir, currentFileName)
262
-
263
+
263
264
  currentKmerCountPath = os.path.join(kmerCountPath, currentFileNameS)
264
265
  cmdKmer = countKmerOut + " -l -k " + str(w) + \
265
266
  " -i " + currentFilePath +\
@@ -278,9 +279,9 @@ for currentFileName in virusFaList :
278
279
  sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
279
280
  filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
280
281
  sys.exit(0)
281
-
282
+
282
283
  filelog.flush()
283
-
284
+
284
285
  end_time = time.time()
285
286
  count += 1
286
287
  #sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
@@ -329,9 +330,9 @@ for currentFileName in hostFaList :
329
330
  sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
330
331
  filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
331
332
  sys.exit(0)
332
-
333
+
333
334
  filelog.flush()
334
-
335
+
335
336
  end_time = time.time()
336
337
  count += 1
337
338
  #sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
@@ -366,7 +367,7 @@ for c in iter(cmdCptMeasureOut.stderr.readline, b''):
366
367
  sys.stdout.write(c.decode("utf-8"))
367
368
  filelog.write(c.decode("utf-8"))
368
369
  filelog.flush()
369
-
370
+
370
371
  end_time = time.time()
371
372
  count += 1
372
373
  sys.stdout.write(" (Average time for computing dissimilarities for one virus-host pair: " + str(round((end_time - start_time)/count/len(virusFaList), 4)) + "s) \n")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: big_simon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-07-28 00:00:00.000000000 Z
11
+ date: 2018-07-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,24 +52,66 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: parse_fasta
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.5'
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 2.5.2
65
+ type: :runtime
66
+ prerelease: false
67
+ version_requirements: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - "~>"
70
+ - !ruby/object:Gem::Version
71
+ version: '2.5'
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 2.5.2
55
75
  - !ruby/object:Gem::Dependency
56
76
  name: rya
57
77
  requirement: !ruby/object:Gem::Requirement
58
78
  requirements:
59
79
  - - "~>"
60
80
  - !ruby/object:Gem::Version
61
- version: 0.1.3
81
+ version: 0.4.0
62
82
  type: :runtime
63
83
  prerelease: false
64
84
  version_requirements: !ruby/object:Gem::Requirement
65
85
  requirements:
66
86
  - - "~>"
67
87
  - !ruby/object:Gem::Version
68
- version: 0.1.3
88
+ version: 0.4.0
89
+ - !ruby/object:Gem::Dependency
90
+ name: trollop
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '2.1'
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: 2.1.3
99
+ type: :runtime
100
+ prerelease: false
101
+ version_requirements: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - "~>"
104
+ - !ruby/object:Gem::Version
105
+ version: '2.1'
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: 2.1.3
69
109
  description: Viral host discovery pipeline.
70
110
  email:
71
111
  - moorer@udel.edu
72
- executables: []
112
+ executables:
113
+ - big_simon
114
+ - ranks
73
115
  extensions: []
74
116
  extra_rdoc_files: []
75
117
  files:
@@ -80,17 +122,25 @@ files:
80
122
  - COPYING
81
123
  - Gemfile
82
124
  - Gemfile.lock
125
+ - Makefile
83
126
  - README.md
84
127
  - Rakefile
85
128
  - big_simon.gemspec
86
129
  - bin/console
87
130
  - bin/setup
131
+ - exe/big_simon
132
+ - exe/ranks
88
133
  - lib/big_simon.rb
134
+ - lib/big_simon/parsers.rb
135
+ - lib/big_simon/pipeline.rb
136
+ - lib/big_simon/runners.rb
137
+ - lib/big_simon/utils.rb
89
138
  - lib/big_simon/version.rb
90
139
  - vendor/bin/mac/WIsH
91
140
  - vendor/bin/mac/computeMeasure.out
92
141
  - vendor/bin/mac/computeMeasure_onlyd2star.out
93
142
  - vendor/bin/mac/countKmer.out
143
+ - vendor/bin/mac/mummer
94
144
  - vendor/bin/mac/vhm.py
95
145
  homepage: https://github.com/mooreryan/big_simon
96
146
  licenses: []