big_simon 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 544705d81f538f636b61b014d4fa84d0a0b68918
4
- data.tar.gz: e8943460d42bca98943fc27ae7919601cefcbe1a
3
+ metadata.gz: dfbea3a58014cbec45a3959ad076fd842c6b393c
4
+ data.tar.gz: 69d9f8ac1dea196f64dd13fb06a13a4c26b38e45
5
5
  SHA512:
6
- metadata.gz: 8e0c2f5be8f1dbcdaa0149788db1fe05c2cdc7e5c0988d20311f8cf4a37aa3c0869d271704f606a00cab7d7bfb03e81a07f89bce52b92b933495fc62efdd26ac
7
- data.tar.gz: e50a57bb282bb20274cb49673b657574b277d61af80c0a510caa6938fc5699eb368d3bcc133d9759d53dab1f6f57c3bd11c9c537227444bf6b6f7602b6f59bb4
6
+ metadata.gz: 3a54fe903bb5c0f2f574a389dfc945ffe3d93bd2b1a15361bda81430542c2ef3990ac31e67b671690e056e23544e4f30f21dbecd756050dfb98dfb924fd15ca3
7
+ data.tar.gz: b61f6ba10b7efc267419ccccaef82654662923629e0b2b61e5a3c5e74b79fe8b753dd160c3e3d7fd57b1e926b54684f9367a839075544c496ff3a839ece57fb5
data/.gitignore CHANGED
@@ -18,4 +18,16 @@
18
18
 
19
19
  vendor/repos
20
20
 
21
- test_files/test_output
21
+ test_files/test_output
22
+
23
+ 0000TEST
24
+
25
+ apple.txt
26
+ stats.r
27
+ toyexample_out/
28
+
29
+ spec/test_files/outdir_for_heatmaps/outdir/
30
+
31
+ stats.tmp.r
32
+
33
+ scratch
data/Gemfile.lock CHANGED
@@ -1,14 +1,17 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- big_simon (0.0.1)
5
- rya (~> 0.1.3)
4
+ big_simon (0.1.0)
5
+ parse_fasta (~> 2.5, >= 2.5.2)
6
+ rya (~> 0.4.0)
7
+ trollop (~> 2.1, >= 2.1.3)
6
8
 
7
9
  GEM
8
10
  remote: https://rubygems.org/
9
11
  specs:
10
12
  abort_if (0.2.0)
11
13
  diff-lcs (1.3)
14
+ parse_fasta (2.5.2)
12
15
  rake (10.5.0)
13
16
  rspec (3.7.0)
14
17
  rspec-core (~> 3.7.0)
@@ -23,10 +26,11 @@ GEM
23
26
  diff-lcs (>= 1.2.0, < 2.0)
24
27
  rspec-support (~> 3.7.0)
25
28
  rspec-support (3.7.1)
26
- rya (0.1.3)
29
+ rya (0.4.0)
27
30
  abort_if (~> 0.2.0)
28
31
  systemu (~> 2.6, >= 2.6.5)
29
32
  systemu (2.6.5)
33
+ trollop (2.1.3)
30
34
 
31
35
  PLATFORMS
32
36
  ruby
data/Makefile ADDED
@@ -0,0 +1,8 @@
1
+ test_small:
2
+ rm -r 0000TEST/; exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
3
+
4
+ test_small_install:
5
+ rm -r 0000TEST/; rake install && exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
6
+
7
+ test_toy:
8
+ rm -r toyexample_out; time exe/big_simon -v vendor/repos/VirHostMatcher/test/toyexample/virus/* -h vendor/repos/VirHostMatcher/test/toyexample/host/* -o toyexample_out -t 3
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # BigSimon
2
2
 
3
- A pipeline for finding hosts of viruses!
3
+ Hi, I'm BigSimon (but you can call me BigSi if you want), and I'm a pipeline for finding hosts of viruses!
4
+
5
+ Mainly, I'm just a wrapper for some other nice tools.
4
6
 
5
7
  ## Installation
6
8
 
@@ -20,7 +22,11 @@ Or install it yourself as:
20
22
 
21
23
  ## Usage
22
24
 
23
- TODO
25
+ For now, there is not much documentation. To see the help file, run:
26
+
27
+ ```
28
+ $ big_simon --help
29
+ ```
24
30
 
25
31
  ## Development
26
32
 
data/big_simon.gemspec CHANGED
@@ -26,5 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency "rake", "~> 10.0"
27
27
  spec.add_development_dependency "rspec", "~> 3.0"
28
28
 
29
- spec.add_runtime_dependency "rya", "~> 0.1.3"
29
+ spec.add_runtime_dependency "parse_fasta", "~> 2.5", ">= 2.5.2"
30
+ spec.add_runtime_dependency "rya", "~> 0.4.0"
31
+ spec.add_runtime_dependency "trollop", "~> 2.1", ">= 2.1.3"
30
32
  end
data/exe/big_simon ADDED
@@ -0,0 +1,244 @@
1
+ #!/usr/bin/env ruby
2
+ Signal.trap("PIPE", "EXIT")
3
+
4
+ require "pp"
5
+ require "tempfile"
6
+
7
+ require "parse_fasta"
8
+ require "trollop"
9
+
10
+ require "big_simon"
11
+
12
+ # TODO make scaled scores with high score being better.
13
+
14
+ Process.extend Rya::CoreExtensions::Process
15
+
16
+ opts = Trollop.options do
17
+ version BigSimon::VERSION_BANNER
18
+
19
+ banner <<-EOS
20
+
21
+ #{BigSimon::VERSION_BANNER}
22
+
23
+ Hi, I'm BigSimon! I'm here to help you figure out the hosts for
24
+ your viruses.
25
+
26
+ I run a bunch of different programs. In addition to doing some
27
+ merging of results, I'll give you heatmaps for all the programs and
28
+ you can check for yourself.
29
+
30
+ The scaled scores run from 0 to 1 with lower scores being better.
31
+
32
+ Options:
33
+ EOS
34
+
35
+ opt :viruses, "Path to fasta file(s) with viruses", type: :strings
36
+ opt :hosts, "Path to fasta file(s) with hosts", type: :strings
37
+ opt :outdir, "Output directory", default: "big_simon"
38
+ opt :threads, "Number of threads to use", default: 1
39
+ end
40
+
41
+ Rya::AbortIf.logger.debug { "Command line opts: #{opts.inspect}" }
42
+
43
+ BigSimon::Utils.check_opt! opts, :viruses
44
+ BigSimon::Utils.check_opt! opts, :hosts
45
+
46
+ # Check infiles
47
+ [opts[:viruses], opts[:hosts]].flatten.each do |fname|
48
+ BigSimon::Utils.check_file! fname
49
+ end
50
+
51
+ Rya::AbortIf.abort_unless opts[:threads] > 0,
52
+ "--threads must be > 0"
53
+
54
+ programs = [
55
+ "WIsH",
56
+ "VirHostMatcher",
57
+ "mummer"
58
+ ]
59
+
60
+ outdir = opts[:outdir]
61
+ threads = opts[:threads]
62
+ virus_fnames = opts[:viruses]
63
+ host_fnames = opts[:hosts]
64
+
65
+ FileUtils.mkdir_p outdir
66
+
67
+ tmpdir = File.join opts[:outdir], "big_simon_tmp"
68
+ tmpdir_virus = File.join tmpdir, "virus"
69
+ tmpdir_host = File.join tmpdir, "host"
70
+
71
+ # all_predictions_fname = File.join outdir, "scores_all.txt"
72
+ mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt"
73
+
74
+ virus_recs, host_recs = [], []
75
+
76
+ # Tempfile.open do |vir_f|
77
+ # Tempfile.open do |host_f|
78
+ # virus_fnames.each do |fname|
79
+ # ParseFasta::SeqFile.open(fname).each_record do |rec|
80
+ # vir_f.puts rec
81
+ #
82
+ # vir_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
83
+ # end
84
+ # end
85
+ #
86
+ # host_fnames.each do |fname|
87
+ # ParseFasta::SeqFile.open(fname).each_record do |rec|
88
+ # host_f.puts rec
89
+ # host_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
90
+ # end
91
+ # end
92
+ #
93
+ # vir_f.fsync
94
+ # host_f.fsync
95
+ #
96
+ # cmd = "mummer -maxmatch -l 15 #{host_f.path} #{vir_f.path} > /Users/moorer/Desktop/mummer.OUT"
97
+ # Process.run_and_time_it! "MUMMER", cmd
98
+ # end
99
+ # end
100
+ #
101
+ # header = nil
102
+ # hits = []
103
+ # hit_info = {}
104
+ # virus = nil
105
+ #
106
+ # File.open("/Users/moorer/Desktop/mummer.OUT", "rt").each_line.with_index do |line, idx|
107
+ # if line.start_with? '>'
108
+ # virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
109
+ #
110
+ # unless hit_info.has_key? virus
111
+ # hit_info[virus] = {}
112
+ # end
113
+ # else
114
+ # host, _, _, len = line.chomp.strip.split(" ")
115
+ # host = host.sub(/___reverse$/, "").strip
116
+ #
117
+ # unless hit_info[virus].has_key? host
118
+ # hit_info[virus][host] = -1
119
+ # end
120
+ #
121
+ # hit_info[virus][host] = len.to_i if len.to_i > hit_info[virus][host]
122
+ # end
123
+ # end
124
+ #
125
+ # puts
126
+ #
127
+ # hh = hit_info.map do |virus, info|
128
+ # [virus, info.to_a.sort_by {|gen, len| len}.reverse]
129
+ # end
130
+ #
131
+ # pp hh
132
+
133
+ # hh = hit_info.map do |virus, info|
134
+ # [virus, info.to_a.sort_by { |host, hit_len| hit_len }.reverse
135
+ #
136
+ # end
137
+ # p hit_info
138
+
139
+ scores_files = {}
140
+ programs.each do |program|
141
+ raw_fname = File.join outdir, "scores_raw.#{program}.txt"
142
+ scaled_fname = File.join outdir, "scores_scaled.#{program}.txt"
143
+
144
+ scores_files[program] = {
145
+ raw: File.open(raw_fname, "w"),
146
+ scaled: File.open(scaled_fname, "w")
147
+ }
148
+ end
149
+
150
+ scores_files.each do |program, files|
151
+ files.each do |name, file|
152
+ file.puts %w[virus host score].join "\t"
153
+ end
154
+ end
155
+
156
+ name_map_virus, all_ids_virus = BigSimon::Utils.set_up_tmp_dirs virus_fnames, tmpdir_virus, "virus"
157
+ name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmpdir_host, "host"
158
+
159
+ wish_outf = BigSimon::Runners.wish BigSimon::WISH, tmpdir_virus, tmpdir_host, tmpdir, threads
160
+ vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpdir_host, tmpdir
161
+
162
+ host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads
163
+
164
+ # Map them back to simple names. TODO just have it spit these out from the beginning.
165
+ host_info_mummer_simple_names = {}
166
+ inverted_name_map_virus = name_map_virus.invert
167
+ inverted_name_map_host = name_map_host.invert
168
+
169
+ host_info_mummer.each do |virus, host_tables|
170
+ virname = virus
171
+ if inverted_name_map_virus.has_key? virus
172
+ virname = inverted_name_map_virus[virus]
173
+ end
174
+
175
+ host_info_mummer_simple_names[virname] = []
176
+
177
+ host_tables.map do |table|
178
+ hostname = inverted_name_map_host.has_key?(table[:host]) ? inverted_name_map_host[table[:host]] : table[:host]
179
+ new_table = { host: hostname, score: table[:score], scaled_score: table[:scaled_score] }
180
+
181
+ host_info_mummer_simple_names[virname] << new_table
182
+ end
183
+ end
184
+
185
+ host_info_wish = BigSimon::Parsers.wish wish_outf
186
+ host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf
187
+
188
+ host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer_simple_names], programs
189
+ host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host
190
+
191
+ # puts
192
+ # pp host_info
193
+ # puts
194
+
195
+ # Just a basic all info file
196
+ # File.open all_predictions_fname, "w" do |f|
197
+ # f.puts %w[virus host program score scaled.score].join "\t"
198
+
199
+ host_info.each do |virus, h1|
200
+ h1.each do |host, h2|
201
+ lines = {}
202
+
203
+ h2[:scores].each do |program, score|
204
+ lines[[virus, host, program]] = [score]
205
+
206
+ scores_files[program][:raw].puts [virus, host, score].join "\t"
207
+ end
208
+
209
+ # Add in the scaled score too.
210
+ h2[:scaled_scores].each do |program, score|
211
+ lines[[virus, host, program]] << score
212
+
213
+ scores_files[program][:scaled].puts [virus, host, score].join "\t"
214
+ end
215
+
216
+ # lines.each do |(virus, host, program), (score, scaled_score)|
217
+ # f.puts [virus, host, program, score, scaled_score].join "\t"
218
+ # end
219
+ end
220
+ end
221
+ # end
222
+
223
+ # A file with mean scaled scores.
224
+ File.open mean_scaled_scores_fname, "w" do |f|
225
+ f.puts %w[virus host score].join "\t"
226
+
227
+ host_info.each do |virus, h1|
228
+ h1.each do |host, h2|
229
+ scaled_scores = h2[:scaled_scores].values
230
+
231
+ mean_scaled_score = scaled_scores.reduce(:+) / scaled_scores.length.to_f
232
+
233
+ f.puts [virus, host, mean_scaled_score].join "\t"
234
+ end
235
+ end
236
+ end
237
+
238
+ scores_files.each do |program, file|
239
+ file.values.map(&:close)
240
+ end
241
+ FileUtils.rm_r tmpdir
242
+
243
+ # Make the heatmaps
244
+ BigSimon::Runners.heatmaps BigSimon::RSCRIPT, outdir, File.join(outdir, "heatmaps")
data/exe/ranks ADDED
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env ruby
2
+ Signal.trap("PIPE", "EXIT")
3
+
4
+ require "pp"
5
+
6
+ by_program = {}
7
+ lines = []
8
+
9
+ TOP = (ENV["TOP"] || 5).to_i
10
+
11
+ ARGV.each do |fname|
12
+ scores = {}
13
+ File.open(fname, "rt").each_line.with_index do |line, idx|
14
+ unless idx.zero?
15
+ virus, host, score = line.chomp.split "\t"
16
+
17
+ unless scores.has_key? virus
18
+ scores[virus] = []
19
+ end
20
+
21
+ scores[virus] << [host, score.to_f]
22
+ end
23
+ end
24
+
25
+ scores.sort_by { |virus, _| virus }.each do |virus, host_scores|
26
+ # Lowest score is the best
27
+ top_5 = host_scores.sort_by { |host, score| score }.take(TOP).map(&:first)
28
+
29
+ line = [File.basename(fname), virus, top_5]
30
+ lines << line
31
+
32
+ # puts line.join "\t"
33
+ end
34
+ # puts
35
+ end
36
+
37
+ lines.each do |line|
38
+ program, virus, all = line
39
+ first = all.first
40
+
41
+ unless by_program.has_key? virus
42
+ by_program[virus] = {}
43
+ end
44
+
45
+ unless program == "scores_scaled.mean.txt"
46
+ by_program[virus][program] = { first: first, all: all }
47
+ end
48
+ end
49
+
50
+ # These track the number of times a host shows up in the first spot and in the top N spots for that virus for all programs.
51
+ first_table = {}
52
+ top_host_table = {}
53
+
54
+ by_program.each do |virus, program_tables|
55
+ first_table[virus] = {}
56
+ top_host_table[virus] = {}
57
+
58
+ program_tables.each do |program, top_info|
59
+
60
+
61
+ first_host = top_info[:first]
62
+ all_top = top_info[:all]
63
+
64
+ unless first_table[virus].has_key? first_host
65
+ first_table[virus][first_host] = []
66
+ end
67
+
68
+
69
+ first_table[virus][first_host] << program
70
+
71
+ all_top.each do |top_host|
72
+ unless top_host_table[virus].has_key? top_host
73
+ top_host_table[virus][top_host] = []
74
+ end
75
+
76
+ top_host_table[virus][top_host] << program
77
+ end
78
+ end
79
+ end
80
+
81
+ first_table.each do |virus, host_counts|
82
+ host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
83
+ STDERR.puts [virus, :best, host, programs.count, programs].join "\t"
84
+ end
85
+ STDERR.puts
86
+ end
87
+
88
+ top_host_table.each do |virus, host_counts|
89
+ host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
90
+ puts [virus, :top_N, host, programs.count, programs].join "\t"
91
+ end
92
+ puts
93
+ end
@@ -0,0 +1,78 @@
1
+ module BigSimon
2
+ # Methods for parsing output files
3
+ class Parsers
4
+
5
+ # @note VirHostMatcher returns true distances that run from 0 to 1, so it doesn't need scaling.
6
+ # @note VirHostMatcher includes the whole file name as the id of the organism, so we chop off some common endings.
7
+ def self.vir_host_matcher fname
8
+ hosts = nil
9
+
10
+ host_info = {}
11
+ File.open(fname, "rt").each_line.with_index do |line, idx|
12
+ line.chomp!
13
+ line.sub! /,$/, "" # get rid of trailing commas
14
+
15
+ if idx.zero?
16
+ stat, *hosts = line.split ","
17
+
18
+ hosts.map! { |str| BigSimon::Utils.strip_suffix str }
19
+ else
20
+ ary = line.split ","
21
+ virus = BigSimon::Utils.strip_suffix ary.shift
22
+
23
+ # In this case the best value is the lowest distance.
24
+ dists = ary.map.
25
+ with_index do |dist, idx|
26
+ { host: hosts[idx], score: dist.to_f, scaled_score: dist.to_f }
27
+ end.sort_by { |ht| ht[:scaled_score] }
28
+
29
+
30
+ host_info[virus] = dists
31
+ end
32
+ end
33
+
34
+ host_info
35
+ end
36
+
37
+ # @note WIsH gives log likelihoods so the scaled value is actually scaled.
38
+ # @note The viruses and hosts will have the ID rather than the file name.
39
+ def self.wish fname
40
+ viruses = nil
41
+
42
+ host_info = {}
43
+
44
+ hosts = nil
45
+ File.open(fname, "rt").each_line.map.with_index do |line, idx|
46
+ line.chomp!
47
+
48
+ if idx.zero?
49
+ ary = line.split("\t")
50
+ ary.unshift("")
51
+ else
52
+ ary = line.split("\t")
53
+ end
54
+ end.transpose.each_with_index do |line_ary, idx|
55
+ if idx.zero?
56
+ hosts = line_ary.drop(1)
57
+ else
58
+ virus = line_ary.shift
59
+
60
+ scores = line_ary.map(&:to_f)
61
+
62
+ host_vals = scores.map.with_index do |score, idx|
63
+ { host: hosts[idx], score: score, scaled_score: 1 - Math.exp(score) }
64
+ end
65
+
66
+ host_info[virus] = host_vals
67
+ end
68
+
69
+ host_info.each do |virus, hosts|
70
+ hosts.sort_by! { |ht| ht[:scaled_score] }
71
+ end
72
+ end
73
+
74
+ host_info
75
+ end
76
+ end
77
+
78
+ end
@@ -0,0 +1,64 @@
1
+ module BigSimon
2
+ class Pipeline
3
+ # @param collated_results_table { virus => host => score_type => program => score }
4
+ def self.map_taxa collated_results_table, virus_name_map, host_name_map
5
+ new_results_table = {}
6
+
7
+ collated_results_table.each do |virus_name, host_table|
8
+ if virus_name_map.include? virus_name
9
+ new_virus_name = virus_name_map[virus_name]
10
+ else
11
+ new_virus_name = virus_name
12
+ end
13
+
14
+ new_results_table[new_virus_name] = {}
15
+
16
+ host_table.each do |host_name, score_table|
17
+ if host_name_map.include? host_name
18
+ new_host_name = host_name_map[host_name]
19
+ else
20
+ new_host_name = host_name
21
+ end
22
+
23
+ new_results_table[new_virus_name][new_host_name] = score_table
24
+ end
25
+ end
26
+
27
+ new_results_table
28
+ end
29
+
30
+ # @param [Array<Hash>] results_table host info hash tables. See functions in Parsers class.
31
+ # @param [Array<String>] programs names of programs generating hash tables (in the same order as results_table)
32
+ def self.collate_host_results results_table, programs
33
+ Rya::AbortIf.assert results_table.count == programs.count
34
+
35
+ virus_host_scores = {}
36
+ all_viruses = results_table.reduce(Set.new) { |acc, ht| acc + ht.keys }
37
+
38
+ all_viruses.each do |virus|
39
+ virus_host_scores[virus] = {}
40
+ end
41
+
42
+ results_table.each_with_index do |ht, idx|
43
+ program = programs[idx]
44
+
45
+ ht.each do |virus, host_scores|
46
+ host_scores.each do |ht|
47
+ host = ht[:host]
48
+ score = ht[:score]
49
+ scaled_score = ht[:scaled_score]
50
+
51
+ unless virus_host_scores[virus].has_key? host
52
+ virus_host_scores[virus][host] = { scores: {}, scaled_scores: {}}
53
+ end
54
+
55
+ virus_host_scores[virus][host][:scores][program] = score
56
+ virus_host_scores[virus][host][:scaled_scores][program] = scaled_score
57
+ end
58
+ end
59
+ end
60
+
61
+ virus_host_scores
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,189 @@
1
+ require "tempfile"
2
+
3
+ module BigSimon
4
+ class Runners
5
+
6
+ # This one's a bit different as it parses as well and returns original names.
7
+ # @todo Also do the reverse of each genome in case it's a contig.
8
+ def self.mummer exe, vir_dir, host_dir, outdir, threads
9
+ klass = Class.new.extend Rya::CoreExtensions::Math
10
+ FileUtils.mkdir_p outdir
11
+
12
+ # TODO put these all in one file then do it?
13
+
14
+ results = {}
15
+
16
+ # Takes names in files and puts them to the file names
17
+ name_map = {}
18
+
19
+ Dir.glob(vir_dir + "/*").each do |vir_fname|
20
+ this_virus_scores = []
21
+ virus = nil
22
+
23
+ Dir.glob(host_dir + "/*").each do |host_fname|
24
+ vir_base = File.basename vir_fname
25
+ host_base = File.basename host_fname
26
+ outfname = File.join outdir, "#{vir_base}___#{host_base}.mummer"
27
+
28
+ # -l is min length of a match TODO pull this into a const
29
+ # -F to force 4 columns
30
+ cmd = "#{exe} -F " \
31
+ "-maxmatch " \
32
+ "-l 15 " \
33
+ "#{host_fname} " \
34
+ "#{vir_fname} " \
35
+ "> #{outfname}"
36
+
37
+ Process.run_and_time_it! "Calculating matches", cmd
38
+
39
+ # Note there should only be one '>' per file here.
40
+ host = nil
41
+ score = 0
42
+ File.open(outfname, "rt").each_line.with_index do |line, idx|
43
+ if idx.zero?
44
+ this_virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
45
+
46
+ Rya::AbortIf::abort_unless(this_virus == virus, "OOPS") if virus
47
+
48
+ virus ||= this_virus
49
+ else
50
+ ary = line.chomp.strip.split(" ")
51
+ Rya::AbortIf.abort_unless ary.count == 4, "Problem parsing #{outfname} (mummer output)"
52
+
53
+ host = ary[0].sub(/___reverse$/, "").strip
54
+ len = ary[3].to_i
55
+
56
+ score = len if len > score
57
+ end
58
+ end
59
+
60
+ this_virus_scores << score
61
+
62
+ unless results.has_key? virus
63
+ results[virus] = []
64
+ end
65
+
66
+ results[virus] << { host: host, score: score, scaled_score: nil }
67
+
68
+ FileUtils.rm outfname
69
+ end
70
+
71
+ # This was the original scaling, i.e. per virus
72
+ # min = 0 # this_virus_scores.min # Technically, this should range from 0 to 15. Any data missing from this table would give a zero. TODO we don't actually account for this though.
73
+ # max = this_virus_scores.max
74
+ # from = 1
75
+ # to = 0
76
+ #
77
+ # results[virus].each do |host_table|
78
+ # host_table[:scaled_score] = klass.scale host_table[:score], min, max, from, to
79
+ # end
80
+ end
81
+
82
+ all_scores = []
83
+ results.each do |virus, host_tables|
84
+ all_scores << host_tables.map { |table| table[:score] }
85
+ end
86
+
87
+ all_scores.flatten!
88
+ max = all_scores.max
89
+
90
+ results.each do |virus, host_tables|
91
+ host_tables.each do |host_table|
92
+ host_table[:scaled_score] = klass.scale host_table[:score], 0, max, 1, 0
93
+ end
94
+ end
95
+
96
+ results
97
+ end
98
+
99
+ def self.vir_host_matcher exe, vir_dir, host_dir, outdir
100
+ FileUtils.mkdir_p outdir
101
+
102
+ cmd = "python #{exe} " \
103
+ "-v #{vir_dir} " \
104
+ "-b #{host_dir} " \
105
+ "-o #{outdir} " \
106
+ "-d 1" # only compute d2star dissimilarity
107
+
108
+ Process.run_and_time_it! "Computing d2star dissimilarity", cmd
109
+
110
+ tmp_dir = File.join outdir, "tmp"
111
+ FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir
112
+
113
+ bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
114
+ bad_files.each do |fname|
115
+ path = File.join outdir, fname
116
+
117
+ FileUtils.rm path if File.exist? path
118
+ end
119
+
120
+ outf = File.join outdir, "d2star_k6.csv"
121
+ new_outf = File.join outdir, "vir_host_matcher.txt"
122
+ FileUtils.mv outf, new_outf
123
+
124
+ new_outf
125
+ end
126
+
127
+ # Runs the WIsH program
128
+ #
129
+ # @raise [AbortIf::Exit] if commands fail
130
+ def self.wish exe, vir_dir, host_dir, outdir, threads
131
+ model_dir = File.join outdir, "model"
132
+
133
+ FileUtils.mkdir_p model_dir
134
+
135
+ build_model = "#{exe} " \
136
+ "-t #{threads} " \
137
+ "-c build " \
138
+ "-g #{host_dir} " \
139
+ "-m #{model_dir}"
140
+
141
+ predict = "#{exe} " \
142
+ "-t #{threads} " \
143
+ "-c predict " \
144
+ "-g #{vir_dir} " \
145
+ "-m #{model_dir} " \
146
+ "-r #{outdir}"
147
+
148
+ Process.run_and_time_it! "Building model", build_model
149
+ Process.run_and_time_it! "Predicting host", predict
150
+
151
+ FileUtils.rm_r model_dir if Dir.exist? model_dir
152
+
153
+ outf = File.join outdir, "llikelihood.matrix"
154
+ new_outf = File.join outdir, "wish.txt"
155
+ FileUtils.mv outf, new_outf
156
+
157
+ new_outf
158
+ end
159
+
160
+ def self.heatmaps exe, indir, outdir
161
+ FileUtils.mkdir_p outdir
162
+
163
+ fnames = Dir.glob("#{indir}/scores*.txt").map do |in_fname|
164
+ extname = File.extname in_fname
165
+ basename = File.basename in_fname, extname
166
+
167
+ out_fname = File.join outdir, "#{basename}.heatmap.pdf"
168
+
169
+ [in_fname, out_fname]
170
+ end
171
+
172
+
173
+ rcode_str = BigSimon::Utils.rcode fnames
174
+
175
+ Object::Tempfile.open do |f|
176
+ f.puts rcode_str
177
+ f.fsync # ensure no data is buffered
178
+
179
+
180
+ cmd = "#{exe} #{f.path}"
181
+ Process.run_and_time_it! "Drawing heatmaps", cmd
182
+ end
183
+
184
+ out_fnames = fnames.map(&:last)
185
+ end
186
+ end
187
+ end
188
+
189
+
@@ -0,0 +1,108 @@
1
+ module BigSimon
2
+ # @todo These don't have unit tests yet.
3
+ # @note set_up_tmp_dirs skips any duplicate IDs and only keeps the first one.
4
+ class Utils
5
+ def self.check_file! fname
6
+ Rya::AbortIf.abort_if fname && !File.exist?(fname),
7
+ "#{fname} doesn't exist! Try big_simon --help for help."
8
+ end
9
+
10
+ def self.check_opt! opts, arg
11
+ Rya::AbortIf.abort_unless opts.send(:fetch, "#{arg}_given".to_sym),
12
+ "You must specify --#{arg.to_s.tr('_', '-')}. Try big_simon --help for help."
13
+ end
14
+
15
+ def self.rcode fnames
16
+ functions = %Q|
17
+ library(reshape2)
18
+ library(gplots)
19
+ library(RColorBrewer)
20
+
21
+ file.join <- function(...) {
22
+ paste(..., sep="/")
23
+ }
24
+
25
+ draw.heatmap <- function(infname, outfname) {
26
+ dat <- read.table(infname, header=T, sep="\t")
27
+
28
+ wide.dat <- dcast(dat, host ~ virus, value.var="score")
29
+
30
+ hosts <- wide.dat[, 1]
31
+ scores <- wide.dat[, 2:ncol(wide.dat)]
32
+ scores.numeric <- apply(scores, 2, as.numeric)
33
+
34
+ scores.matrix <- as.matrix(scores.numeric)
35
+
36
+ rownames(scores.matrix) <- hosts
37
+
38
+ palette <- "YlOrBr"
39
+ col <- colorRampPalette(brewer.pal(n=9, palette))(n = 25)
40
+ size <- 0.75
41
+
42
+ pdf(outfname, height=5, width=8)
43
+
44
+ heatmap.2(scores.matrix,
45
+ trace="none", ## Disable those wonky lines.
46
+ col=col, ## Set the color.
47
+
48
+ ## Size opts
49
+ margins=c(11, 11), cexRow=size, cexCol=size,
50
+
51
+ ## Key labeling
52
+ key.xlab="Score")
53
+
54
+ invisible(dev.off())
55
+ }
56
+
57
+ |
58
+
59
+ drawing = fnames.map do |in_fname, out_fname|
60
+ %Q{
61
+
62
+ draw.heatmap("#{in_fname}", "#{out_fname}")
63
+ }
64
+ end.join
65
+
66
+ [functions, drawing].join "\n"
67
+ end
68
+
69
+ def self.scale_log_likelihood ll
70
+ 1 - Math.exp(ll)
71
+ end
72
+
73
+ def self.set_up_tmp_dirs fastas, tmpdir, which
74
+ Object::FileUtils.mkdir_p tmpdir
75
+
76
+ name_map = {}
77
+ all_ids = Set.new
78
+
79
+ seq_num = -1
80
+ fastas.each do |fname|
81
+ ParseFasta::SeqFile.open(fname).each_record do |rec|
82
+ if all_ids.include? rec.id
83
+ Rya::AbortIf.logger.warn { "#{rec.id} was seen more than one time! Duplicate organism IDs are not allowed, so we will only keep the first one." }
84
+ else
85
+ all_ids << rec.id
86
+
87
+ seq_num += 1
88
+
89
+ new_id = "#{which}_#{seq_num}"
90
+ name_map[new_id] = rec.id
91
+
92
+ outfname = File.join tmpdir, "#{new_id}.fa"
93
+
94
+ File.open(outfname, "w") do |f|
95
+ f.puts rec
96
+ end
97
+ end
98
+ end
99
+ end
100
+
101
+ [name_map, all_ids]
102
+ end
103
+
104
+ def self.strip_suffix fname
105
+ fname.sub /.fasta$|.fa$/, ""
106
+ end
107
+ end
108
+ end
@@ -1,3 +1,15 @@
1
1
  module BigSimon
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
+
4
+ COPYRIGHT = "2018 Ryan Moore"
5
+ CONTACT = "moorer@udel.edu"
6
+ WEBSITE = "https://github.com/mooreryan/InteinFinder"
7
+ LICENSE = "GPLv3"
8
+
9
+ VERSION_BANNER =
10
+ " # Version: v#{VERSION}
11
+ # Copyright: #{COPYRIGHT}
12
+ # Contact: #{CONTACT}
13
+ # License: #{LICENSE}"
14
+
3
15
  end
data/lib/big_simon.rb CHANGED
@@ -1,100 +1,27 @@
1
1
  require "rya"
2
+ require "set"
3
+ require "pp"
2
4
 
3
5
  require "big_simon/version"
4
6
 
7
+ require "big_simon/utils"
8
+
9
+ require "big_simon/runners"
10
+ require "big_simon/parsers"
11
+ require "big_simon/pipeline"
12
+
5
13
  Time.extend Rya::CoreExtensions::Time
6
14
  Process.extend Rya::CoreExtensions::Process
15
+ Array.include Rya::CoreExtensions::Array
16
+ Math.extend Rya::CoreExtensions::Math
7
17
 
8
18
  module BigSimon
9
-
10
- # Project directories
11
19
  ROOT = File.join __dir__, ".."
12
20
  BIN = File.join ROOT, "vendor", "bin", "mac"
13
21
  SPEC = File.join ROOT, "spec"
14
22
  TEST_FILES = File.join SPEC, "test_files"
15
-
16
- class Parsers
17
-
18
- def self.vir_host_matcher fname
19
- hosts = nil
20
-
21
- host_info = {}
22
- File.open(fname, "rt").each_line.with_index do |line, idx|
23
- line.chomp!
24
- line.sub! /,$/, "" # git rid of trailing commas
25
-
26
- if idx.zero?
27
- stat, *hosts = line.split ","
28
- else
29
- ary = line.split ","
30
- virus = ary.shift
31
-
32
- dists = ary.map.
33
- with_index { |dist, idx| [hosts[idx], dist.to_f] }.
34
- sort_by { |_, dist| dist }
35
-
36
- best_host = dists[0][0]
37
-
38
- host_info[virus] = {
39
- best: best_host,
40
- all: dists
41
- }
42
- end
43
- end
44
-
45
- host_info
46
- end
47
- end
48
-
49
- class Runners
50
-
51
- # Runs the WIsH program
52
- #
53
- # @raise [AbortIf::Exit] if commands fail
54
- def self.wish exe, vir_dir, host_dir, outdir, threads
55
- model_dir = File.join outdir, "model"
56
-
57
- FileUtils.mkdir_p model_dir
58
-
59
- build_model = "#{exe} " \
60
- "-t #{threads} " \
61
- "-c build " \
62
- "-g #{host_dir} " \
63
- "-m #{model_dir}"
64
-
65
- predict = "#{exe} " \
66
- "-t #{threads} " \
67
- "-c predict " \
68
- "-g #{vir_dir} " \
69
- "-m #{model_dir} " \
70
- "-r #{outdir} -b"
71
-
72
- Process.run_and_time_it! "Building model", build_model
73
- Process.run_and_time_it! "Predicting host", predict
74
-
75
- FileUtils.rm_r model_dir if Dir.exist? model_dir
76
- end
77
-
78
- def self.vir_host_matcher exe, vir_dir, host_dir, outdir
79
- FileUtils.mkdir_p outdir
80
-
81
- cmd = "python #{exe} " \
82
- "-v #{vir_dir} " \
83
- "-b #{host_dir} " \
84
- "-o #{outdir} " \
85
- "-d 1" # only compute d2star dissimilarity
86
-
87
- Process.run_and_time_it! "Computing d2star dissimilarity", cmd
88
-
89
- tmp_dir = File.join outdir, "tmp"
90
- FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir
91
-
92
- bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
93
- bad_files.each do |fname|
94
- path = File.join outdir, fname
95
-
96
- FileUtils.rm path if File.exist? path
97
- end
98
- end
99
- end
23
+ WISH = File.join BIN, "WIsH"
24
+ VHM = File.join BIN, "vhm.py"
25
+ MUMMER = File.join BIN, "mummer"
26
+ RSCRIPT = "Rscript"
100
27
  end
Binary file
@@ -46,8 +46,9 @@ if not os.path.exists(tmpDir) :
46
46
  filelog = open(os.path.join(tmpDir, 'vhm.log'), 'w')
47
47
 
48
48
  ## name length ##
49
- nameLen = 93 - len(options.outDir)
49
+ # nameLen = 93 - len(options.outDir)
50
50
  #### possibly because of the kmercount folder name for each contig is too long?
51
+ nameLen = 99999999999999
51
52
 
52
53
 
53
54
  #################### 0: preparation ############################
@@ -211,7 +212,7 @@ if options.hostTaxaFile is None :
211
212
  hostTaxaFileWrite = open(hostTaxaFile, 'w') ## make file blank
212
213
  hostTaxaFileWrite.close()
213
214
  hostTaxaFileWrite = open(hostTaxaFile, 'a')
214
-
215
+
215
216
  hostTaxaFileWrite.write("hostNCBIName hostName hostSuperkingdom hostPhylum hostClass hostOrder hostFamily hostGenus hostSpecies\n")
216
217
  for currentFileName in hostFaList :
217
218
  if currentFileName.startswith('.') :
@@ -235,7 +236,7 @@ else :
235
236
  hostTaxaTable = numpy.genfromtxt(options.hostTaxaFile,delimiter="\t", dtype=str)
236
237
  hostTaxaTable[hostTaxaTable=='']='unknown'
237
238
  numpy.savetxt(hostTaxaFile, hostTaxaTable, fmt="%s", delimiter='\t', newline='\n')
238
-
239
+
239
240
  filelog.flush()
240
241
 
241
242
  #################### 1: count kmer and prepare list files ############################
@@ -259,7 +260,7 @@ for currentFileName in virusFaList :
259
260
  filelog.write("Step 1: counting kmers for virus " + currentFileNameS + "\n")
260
261
  for w in range(1, (kmax+1)) :
261
262
  currentFilePath = os.path.join(options.virusFaDir, currentFileName)
262
-
263
+
263
264
  currentKmerCountPath = os.path.join(kmerCountPath, currentFileNameS)
264
265
  cmdKmer = countKmerOut + " -l -k " + str(w) + \
265
266
  " -i " + currentFilePath +\
@@ -278,9 +279,9 @@ for currentFileName in virusFaList :
278
279
  sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
279
280
  filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
280
281
  sys.exit(0)
281
-
282
+
282
283
  filelog.flush()
283
-
284
+
284
285
  end_time = time.time()
285
286
  count += 1
286
287
  #sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
@@ -329,9 +330,9 @@ for currentFileName in hostFaList :
329
330
  sys.stderr.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
330
331
  filelog.write( "ERROR in counting kmers for " + currentFileNameS + "\n")
331
332
  sys.exit(0)
332
-
333
+
333
334
  filelog.flush()
334
-
335
+
335
336
  end_time = time.time()
336
337
  count += 1
337
338
  #sys.stdout.write(str(end_time - start_time) + "s for " + str(count) + " seqs \n")
@@ -366,7 +367,7 @@ for c in iter(cmdCptMeasureOut.stderr.readline, b''):
366
367
  sys.stdout.write(c.decode("utf-8"))
367
368
  filelog.write(c.decode("utf-8"))
368
369
  filelog.flush()
369
-
370
+
370
371
  end_time = time.time()
371
372
  count += 1
372
373
  sys.stdout.write(" (Average time for computing dissimilarities for one virus-host pair: " + str(round((end_time - start_time)/count/len(virusFaList), 4)) + "s) \n")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: big_simon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-07-28 00:00:00.000000000 Z
11
+ date: 2018-07-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,24 +52,66 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: parse_fasta
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.5'
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 2.5.2
65
+ type: :runtime
66
+ prerelease: false
67
+ version_requirements: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - "~>"
70
+ - !ruby/object:Gem::Version
71
+ version: '2.5'
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 2.5.2
55
75
  - !ruby/object:Gem::Dependency
56
76
  name: rya
57
77
  requirement: !ruby/object:Gem::Requirement
58
78
  requirements:
59
79
  - - "~>"
60
80
  - !ruby/object:Gem::Version
61
- version: 0.1.3
81
+ version: 0.4.0
62
82
  type: :runtime
63
83
  prerelease: false
64
84
  version_requirements: !ruby/object:Gem::Requirement
65
85
  requirements:
66
86
  - - "~>"
67
87
  - !ruby/object:Gem::Version
68
- version: 0.1.3
88
+ version: 0.4.0
89
+ - !ruby/object:Gem::Dependency
90
+ name: trollop
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '2.1'
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: 2.1.3
99
+ type: :runtime
100
+ prerelease: false
101
+ version_requirements: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - "~>"
104
+ - !ruby/object:Gem::Version
105
+ version: '2.1'
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: 2.1.3
69
109
  description: Viral host discovery pipeline.
70
110
  email:
71
111
  - moorer@udel.edu
72
- executables: []
112
+ executables:
113
+ - big_simon
114
+ - ranks
73
115
  extensions: []
74
116
  extra_rdoc_files: []
75
117
  files:
@@ -80,17 +122,25 @@ files:
80
122
  - COPYING
81
123
  - Gemfile
82
124
  - Gemfile.lock
125
+ - Makefile
83
126
  - README.md
84
127
  - Rakefile
85
128
  - big_simon.gemspec
86
129
  - bin/console
87
130
  - bin/setup
131
+ - exe/big_simon
132
+ - exe/ranks
88
133
  - lib/big_simon.rb
134
+ - lib/big_simon/parsers.rb
135
+ - lib/big_simon/pipeline.rb
136
+ - lib/big_simon/runners.rb
137
+ - lib/big_simon/utils.rb
89
138
  - lib/big_simon/version.rb
90
139
  - vendor/bin/mac/WIsH
91
140
  - vendor/bin/mac/computeMeasure.out
92
141
  - vendor/bin/mac/computeMeasure_onlyd2star.out
93
142
  - vendor/bin/mac/countKmer.out
143
+ - vendor/bin/mac/mummer
94
144
  - vendor/bin/mac/vhm.py
95
145
  homepage: https://github.com/mooreryan/big_simon
96
146
  licenses: []