big_simon 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9c81d1057b304f170cdf1dd6a7551bd981efc232
4
- data.tar.gz: c0550efe308bc0080c9f14ce65057b100ff2f32b
3
+ metadata.gz: 95ca94aff107e2545dd26ab671c2ed125cb89164
4
+ data.tar.gz: 8cb9098b4462d6702ecd692e7261abb9c1c7c26a
5
5
  SHA512:
6
- metadata.gz: 652abc1ab4507ed51012cde1a541a8e5ce75e30a2c2d8b9ef4cb0363e49bb8ea4f088dcb4786bb5f9f466f8f0ed0d2f2e6d9b446aa3a4c8fd2a6a8f4370f6a78
7
- data.tar.gz: 3a82d66018a1dea01afb9f0230a9f8ad6ac90d96d5862a79b57ecb03f954eedf9183c2dc9d03b6f98a6a0675b3293577c4e5001fa7f792bbe68ebefe397d47f8
6
+ metadata.gz: be890e07e1ca248da8e5b80637b25d00c20185d4710229cc9c72d564abc930d4eb734ad53f42842a479827e64205ee1f53f5691b6527d03b9cc41b9b8656ae85
7
+ data.tar.gz: ac30f597de91303e3b8040ecfde48f98d56c343aadfa3a1085d77fbb55692fb055569734470812144d716e73ee8c94265c732a07462bd8490c2b937572961062
data/.gitignore CHANGED
@@ -30,4 +30,12 @@ spec/test_files/outdir_for_heatmaps/outdir/
30
30
 
31
31
  stats.tmp.r
32
32
 
33
- scratch
33
+ scratch
34
+
35
+
36
+
37
+ spec/test_files/homology_files/output/*homology*
38
+
39
+ spec/test_files/mummer_files/output/mummer_out.txt
40
+
41
+ tmp.prodigal.stdin.5129
data/Gemfile.lock CHANGED
@@ -1,7 +1,8 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- big_simon (0.1.1)
4
+ big_simon (0.2.0)
5
+ parallel
5
6
  parse_fasta (~> 2.5, >= 2.5.2)
6
7
  rya (~> 0.4.0)
7
8
  trollop (~> 2.1, >= 2.1.3)
@@ -11,6 +12,7 @@ GEM
11
12
  specs:
12
13
  abort_if (0.2.0)
13
14
  diff-lcs (1.3)
15
+ parallel (1.12.1)
14
16
  parse_fasta (2.5.2)
15
17
  rake (10.5.0)
16
18
  rspec (3.7.0)
data/big_simon.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency "rake", "~> 10.0"
27
27
  spec.add_development_dependency "rspec", "~> 3.0"
28
28
 
29
+ spec.add_runtime_dependency "parallel"
29
30
  spec.add_runtime_dependency "parse_fasta", "~> 2.5", ">= 2.5.2"
30
31
  spec.add_runtime_dependency "rya", "~> 0.4.0"
31
32
  spec.add_runtime_dependency "trollop", "~> 2.1", ">= 2.1.3"
data/exe/big_simon CHANGED
@@ -54,7 +54,8 @@ Rya::AbortIf.abort_unless opts[:threads] > 0,
54
54
  programs = [
55
55
  "WIsH",
56
56
  "VirHostMatcher",
57
- "mummer"
57
+ "mummer",
58
+ "homology",
58
59
  ]
59
60
 
60
61
  outdir = opts[:outdir]
@@ -71,87 +72,6 @@ tmpdir_host = File.join tmpdir, "host"
71
72
  # all_predictions_fname = File.join outdir, "scores_all.txt"
72
73
  mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt"
73
74
 
74
- # virus_recs, host_recs = [], []
75
-
76
- # Tempfile.open do |vir_f|
77
- # Tempfile.open do |host_f|
78
- # virus_fnames.each do |fname|
79
- # ParseFasta::SeqFile.open(fname).each_record do |rec|
80
- # vir_f.puts rec
81
- #
82
- # vir_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
83
- # end
84
- # end
85
- #
86
- # host_fnames.each do |fname|
87
- # ParseFasta::SeqFile.open(fname).each_record do |rec|
88
- # host_f.puts rec
89
- # host_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
90
- # end
91
- # end
92
- #
93
- # vir_f.fsync
94
- # host_f.fsync
95
- #
96
- # cmd = "mummer -maxmatch -l 15 #{host_f.path} #{vir_f.path} > /Users/moorer/Desktop/mummer.OUT"
97
- # Process.run_and_time_it! "MUMMER", cmd
98
- # end
99
- # end
100
- #
101
- # header = nil
102
- # hits = []
103
- # hit_info = {}
104
- # virus = nil
105
- #
106
- # File.open("/Users/moorer/Desktop/mummer.OUT", "rt").each_line.with_index do |line, idx|
107
- # if line.start_with? '>'
108
- # virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
109
- #
110
- # unless hit_info.has_key? virus
111
- # hit_info[virus] = {}
112
- # end
113
- # else
114
- # host, _, _, len = line.chomp.strip.split(" ")
115
- # host = host.sub(/___reverse$/, "").strip
116
- #
117
- # unless hit_info[virus].has_key? host
118
- # hit_info[virus][host] = -1
119
- # end
120
- #
121
- # hit_info[virus][host] = len.to_i if len.to_i > hit_info[virus][host]
122
- # end
123
- # end
124
- #
125
- # puts
126
- #
127
- # hh = hit_info.map do |virus, info|
128
- # [virus, info.to_a.sort_by {|gen, len| len}.reverse]
129
- # end
130
- #
131
- # pp hh
132
-
133
- # hh = hit_info.map do |virus, info|
134
- # [virus, info.to_a.sort_by { |host, hit_len| hit_len }.reverse
135
- #
136
- # end
137
- # p hit_info
138
-
139
- scores_files = {}
140
- programs.each do |program|
141
- raw_fname = File.join outdir, "scores_raw.#{program}.txt"
142
- scaled_fname = File.join outdir, "scores_scaled.#{program}.txt"
143
-
144
- scores_files[program] = {
145
- raw: File.open(raw_fname, "w"),
146
- scaled: File.open(scaled_fname, "w")
147
- }
148
- end
149
-
150
- scores_files.each do |program, files|
151
- files.each do |name, file|
152
- file.puts %w[virus host score].join "\t"
153
- end
154
- end
155
75
 
156
76
  name_map_virus, all_ids_virus = BigSimon::Utils.set_up_tmp_dirs virus_fnames, tmpdir_virus, "virus"
157
77
  name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmpdir_host, "host"
@@ -162,12 +82,38 @@ vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpd
162
82
  # TODO separate the parser from the runner for mummer.
163
83
  host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads
164
84
 
85
+ puts "mummer"
86
+ pp host_info_mummer
87
+ puts
88
+
89
+ # TODO separate the parser from the runner for homology
90
+ host_info_homology = BigSimon::Runners.homology tmpdir_virus, tmpdir_host, tmpdir, threads
91
+
92
+ puts "homology"
93
+ pp host_info_homology
94
+ puts
95
+
96
+
97
+
165
98
  host_info_wish = BigSimon::Parsers.wish wish_outf
166
99
  host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf
167
100
 
168
- host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer], programs
101
+ puts "wish"
102
+ pp host_info_wish
103
+ puts
104
+
105
+ puts "vhm"
106
+ pp host_info_vhm
107
+ puts
108
+
109
+
110
+ host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer, host_info_homology], programs
169
111
  host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host
170
112
 
113
+ puts
114
+ pp host_info_simple_names
115
+ puts
116
+
171
117
  puts
172
118
  pp host_info
173
119
  puts
@@ -176,6 +122,25 @@ puts
176
122
  # File.open all_predictions_fname, "w" do |f|
177
123
  # f.puts %w[virus host program score scaled.score].join "\t"
178
124
 
125
+ scores_files = {}
126
+ programs.each do |program|
127
+ raw_fname = File.join outdir, "scores_raw.#{program}.txt"
128
+ scaled_fname = File.join outdir, "scores_scaled.#{program}.txt"
129
+
130
+ scores_files[program] = {
131
+ raw: File.open(raw_fname, "w"),
132
+ scaled: File.open(scaled_fname, "w")
133
+ }
134
+ end
135
+
136
+ scores_files.each do |program, files|
137
+ files.each do |name, file|
138
+ puts "LALA 3: #{[program, name, file]} #{%w[virus host score].join}"
139
+
140
+ file.puts %w[virus host score].join "\t"
141
+ end
142
+ end
143
+
179
144
  host_info.each do |virus, h1|
180
145
  h1.each do |host, h2|
181
146
  lines = {}
@@ -183,6 +148,8 @@ host_info.each do |virus, h1|
183
148
  h2[:scores].each do |program, score|
184
149
  lines[[virus, host, program]] = [score]
185
150
 
151
+ puts "LALA: #{[virus, host, score]}"
152
+
186
153
  scores_files[program][:raw].puts [virus, host, score].join "\t"
187
154
  end
188
155
 
@@ -190,6 +157,9 @@ host_info.each do |virus, h1|
190
157
  h2[:scaled_scores].each do |program, score|
191
158
  lines[[virus, host, program]] << score
192
159
 
160
+ puts "LALA 2: #{[virus, host, score]}"
161
+
162
+
193
163
  scores_files[program][:scaled].puts [virus, host, score].join "\t"
194
164
  end
195
165
 
@@ -1,5 +1,6 @@
1
1
  require "tempfile"
2
2
  require "parse_fasta"
3
+ require "parallel"
3
4
 
4
5
  module BigSimon
5
6
  class Runners
@@ -52,12 +53,15 @@ module BigSimon
52
53
  vir_f.fsync
53
54
  host_f.fsync
54
55
 
55
- cmd = "mummer -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
56
+ # -k 3 index every third position in reference (broken now, bug in mummer)
57
+ # -n -k 3 -threads 3
58
+ # -n match only A C T G
59
+ cmd = "mummer -n -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
56
60
  Process.run_and_time_it! "MUMMER", cmd
57
61
  end
58
62
  end
59
63
 
60
- virus = nil
64
+ virus = nil
61
65
  overall_max_score = 0
62
66
  File.open(mummer_outfname, "rt").each_line.with_index do |line, idx|
63
67
  line.chomp!
@@ -75,7 +79,7 @@ module BigSimon
75
79
  else
76
80
  ary = line.strip.split " "
77
81
 
78
- host = ary[0].sub(/___reverse$/, "").strip
82
+ host = ary[0].sub(/___reverse$/, "").strip
79
83
  score = ary[3].to_i
80
84
 
81
85
  Rya::AbortIf.assert hit_table[virus].has_key?(host)
@@ -88,7 +92,7 @@ module BigSimon
88
92
  hit_table[virus][host] = score if score > hit_table[virus][host]
89
93
 
90
94
  # Track the overall max for scaling.
91
- overall_max_score = score if score > overall_max_score
95
+ overall_max_score = score if score > overall_max_score
92
96
  end
93
97
  end
94
98
  end
@@ -203,6 +207,121 @@ module BigSimon
203
207
  results
204
208
  end
205
209
 
210
# Scores virus/host "homology-ness" by summing the BLAST bitscore of all
# significant hits between a virus's ORFs and each host's ORFs.
#
# Steps:
#   1. Predict ORFs for all host genomes (every +host_dir/*.fa+ is piped into
#      prodigal at once) and build a nucleotide BLAST db from them.
#   2. For each virus genome (+vir_dir/*.fa+), in parallel worker processes:
#      predict its ORFs, blastn them against the host ORF db (tabular
#      +-outfmt 6+, +-evalue 0.01+), and add up the last column (bitscore)
#      per host.
#   3. Scale every (virus, host) sum against the largest sum seen overall.
#
# Intermediate files are written to +outdir+.  The per-virus ORF files are
# deleted once blasted; the blast result files are left in place.
#
# @note I will make the specified outdir if it doesn't exist.
# @note Assumes that the files end with *.fa
# @note Assumes that the file names match the IDs. This SHOULD be taken care
#   of by the big_simon program.
# @note Paths are interpolated into shell commands unescaped, so they must
#   not contain spaces or shell metacharacters.
# @todo assert that fname thing matches sequence ID name.
#
# @param vir_dir [String] directory holding one *.fa file per virus
# @param host_dir [String] directory holding one *.fa file per host
# @param outdir [String] directory for the ORF, blast db and blast result files
# @param threads [Integer] number of worker processes handed to Parallel
#
# @return [Hash{String => Array<Hash>}] virus ID => one
#   +{ host:, score:, scaled_score: }+ entry per host file in +host_dir+
#   (hosts without any hit get a score of 0).
def self.homology vir_dir, host_dir, outdir, threads
  FileUtils.mkdir_p outdir

  host_orfs          = File.join outdir, "host_orfs.homology"
  host_orfs_blast_db = host_orfs + ".blast_db.homology"

  # Call ORFs on Hosts
  cmd = "cat #{host_dir}/*.fa | #{BigSimon::PRODIGAL} " \
        "-d #{host_orfs} " \
        "> /dev/null"

  Process.run_and_time_it! "Predicting host ORFs", cmd

  # Make blast db's for the host genes.
  cmd = "#{BigSimon::MAKEBLASTDB} " \
        "-in #{host_orfs} " \
        "-out #{host_orfs_blast_db} " \
        "-dbtype nucl"

  Process.run_and_time_it! "Making host blast db", cmd

  vir_genome_fnames = Dir.glob(vir_dir + "/*.fa")

  blast_info = Parallel.map(vir_genome_fnames, in_processes: threads) do |vir_genome_fname|
    vir_basename  = File.basename vir_genome_fname
    vir_orfs      = File.join outdir, vir_basename + ".vir_orfs.homology"
    blast_results = File.join outdir, vir_basename + ".blast_results.homology"

    # This will be used as the viral ID.  Again, we're assuming the input is
    # .fa, which the big_simon program SHOULD ensure.
    # TODO check these things with assertions.
    vir_simple_fname = File.basename vir_genome_fname, ".fa"
    blast_table      = { vir_simple_fname => Hash.new(0) }

    # Call ORFs on the virus.
    cmd = "#{BigSimon::PRODIGAL} " \
          "-d #{vir_orfs} -p meta -i #{vir_genome_fname} " \
          "> /dev/null"

    Process.run_and_time_it! "Predicting ORFs for #{vir_basename}", cmd

    # Blast the virus ORFs against the host ORF db.
    cmd = "#{BigSimon::BLASTN} -query #{vir_orfs} -db #{host_orfs_blast_db} -outfmt 6 -evalue 0.01 -word_size 11 -out #{blast_results}"
    Process.run_and_time_it! "Blasting ORFs for #{vir_basename}", cmd

    # Remove ORFs file
    FileUtils.rm vir_orfs if File.exist? vir_orfs

    Rya::AbortIf.logger.info { "Parsing #{blast_results}" }

    # Parse the blast.  The block form closes the file handle as soon as we
    # are done with it.
    File.open(blast_results, "rt") do |f|
      f.each_line do |line|
        ary = line.chomp.split "\t"

        # Tolerate blank lines rather than crashing on ary[0] being nil.
        next if ary.empty?

        # The .sub() is to remove the annotation that prodigal gives.
        vir_id  = ary[0].sub(/_[0-9]+$/, "")
        host_id = ary[1].sub(/_[0-9]+$/, "")
        score   = ary[11].to_f

        Rya::AbortIf.assert blast_table.has_key?(vir_id), "blast_table: got #{vir_id} should have been #{vir_simple_fname}"

        blast_table[vir_id][host_id] += score
      end
    end

    # NOTE: the blast results file is deliberately kept in outdir.

    [vir_simple_fname, blast_table]
  end

  host_simple_names = Dir.glob(host_dir + "/*.fa").map { |fname| File.basename(fname, ".fa") }

  Rya::AbortIf.assert host_simple_names.length == host_simple_names.uniq.length, "host simple names are not unique"

  Rya::AbortIf.logger.info { "Collating blast results" }

  # Get max score.  Sometimes there are no hits at all, in which case there
  # is nothing to take the max of and we fall back to -1.
  all_scores = blast_info.flat_map do |_, blast_table|
    blast_table.values.flat_map(&:values)
  end
  max_score = all_scores.max || -1

  Rya::AbortIf.assert max_score > -1, "didn't get any scores"

  # Throwaway object that just gives us access to the Rya scale helper.
  # NOTE(review): presumably this maps 0..max_score onto 1..0, so a bigger
  # raw score gives a smaller scaled score -- confirm against Rya.
  klass = Class.new.extend Rya::CoreExtensions::Math

  collated_blast_table = {}

  blast_info.each do |_, blast_table|
    blast_table.each do |vir_id, host_scores|
      collated_blast_table[vir_id] = host_simple_names.map do |host_id|
        # fetch with an explicit 0 so we don't depend on the Hash default
        # surviving the trip back from the worker process.
        score = host_scores.fetch host_id, 0

        { host: host_id, score: score, scaled_score: klass.scale(score, 0, max_score, 1, 0) }
      end
    end
  end

  collated_blast_table
end
324
+
206
325
  def self.vir_host_matcher exe, vir_dir, host_dir, outdir
207
326
  FileUtils.mkdir_p outdir
208
327
 
@@ -273,13 +392,13 @@ module BigSimon
273
392
 
274
393
  out_fname = File.join outdir, "#{basename}.heatmap.pdf"
275
394
 
276
- [in_fname, out_fname]
395
+ [File.absolute_path(in_fname), File.absolute_path(out_fname)]
277
396
  end
278
397
 
279
398
 
280
399
  rcode_str = BigSimon::Utils.rcode fnames
281
400
 
282
- Object::Tempfile.open do |f|
401
+ Object::File.open(File.join(outdir, "RCODE.r"), "w") do |f|
283
402
  f.puts rcode_str
284
403
  f.fsync # ensure no data is buffered
285
404
 
@@ -70,6 +70,7 @@ draw.heatmap("#{in_fname}", "#{out_fname}")
70
70
  1 - Math.exp(ll)
71
71
  end
72
72
 
73
+ # @note I also rename all the sequences in the tmp fasta files with the new ID.
73
74
  def self.set_up_tmp_dirs fastas, tmpdir, which
74
75
  Object::FileUtils.mkdir_p tmpdir
75
76
 
@@ -92,7 +93,7 @@ draw.heatmap("#{in_fname}", "#{out_fname}")
92
93
  outfname = File.join tmpdir, "#{new_id}.fa"
93
94
 
94
95
  File.open(outfname, "w") do |f|
95
- f.puts rec
96
+ f.puts ">#{new_id}\n#{rec.seq}" # TODO HERE
96
97
  end
97
98
  end
98
99
  end
@@ -1,5 +1,5 @@
1
1
  module BigSimon
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
 
4
4
  COPYRIGHT = "2018 Ryan Moore"
5
5
  CONTACT = "moorer@udel.edu"
data/lib/big_simon.rb CHANGED
@@ -20,8 +20,14 @@ module BigSimon
20
20
  BIN = File.join ROOT, "vendor", "bin", "mac"
21
21
  SPEC = File.join ROOT, "spec"
22
22
  TEST_FILES = File.join SPEC, "test_files"
23
+
24
+ # Programs
23
25
  WISH = File.join BIN, "WIsH"
24
26
  VHM = File.join BIN, "vhm.py"
25
27
  MUMMER = File.join BIN, "mummer"
26
28
  RSCRIPT = "Rscript"
29
+
30
+ BLASTN = File.join BIN, "blastn"
31
+ MAKEBLASTDB = File.join BIN, "makeblastdb"
32
+ PRODIGAL = File.join BIN, "prodigal"
27
33
  end
Binary file
Binary file
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: big_simon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-07-30 00:00:00.000000000 Z
11
+ date: 2018-07-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: parallel
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: parse_fasta
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -137,10 +151,13 @@ files:
137
151
  - lib/big_simon/utils.rb
138
152
  - lib/big_simon/version.rb
139
153
  - vendor/bin/mac/WIsH
154
+ - vendor/bin/mac/blastn
140
155
  - vendor/bin/mac/computeMeasure.out
141
156
  - vendor/bin/mac/computeMeasure_onlyd2star.out
142
157
  - vendor/bin/mac/countKmer.out
158
+ - vendor/bin/mac/makeblastdb
143
159
  - vendor/bin/mac/mummer
160
+ - vendor/bin/mac/prodigal
144
161
  - vendor/bin/mac/vhm.py
145
162
  homepage: https://github.com/mooreryan/big_simon
146
163
  licenses: []