big_simon 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9c81d1057b304f170cdf1dd6a7551bd981efc232
4
- data.tar.gz: c0550efe308bc0080c9f14ce65057b100ff2f32b
3
+ metadata.gz: 95ca94aff107e2545dd26ab671c2ed125cb89164
4
+ data.tar.gz: 8cb9098b4462d6702ecd692e7261abb9c1c7c26a
5
5
  SHA512:
6
- metadata.gz: 652abc1ab4507ed51012cde1a541a8e5ce75e30a2c2d8b9ef4cb0363e49bb8ea4f088dcb4786bb5f9f466f8f0ed0d2f2e6d9b446aa3a4c8fd2a6a8f4370f6a78
7
- data.tar.gz: 3a82d66018a1dea01afb9f0230a9f8ad6ac90d96d5862a79b57ecb03f954eedf9183c2dc9d03b6f98a6a0675b3293577c4e5001fa7f792bbe68ebefe397d47f8
6
+ metadata.gz: be890e07e1ca248da8e5b80637b25d00c20185d4710229cc9c72d564abc930d4eb734ad53f42842a479827e64205ee1f53f5691b6527d03b9cc41b9b8656ae85
7
+ data.tar.gz: ac30f597de91303e3b8040ecfde48f98d56c343aadfa3a1085d77fbb55692fb055569734470812144d716e73ee8c94265c732a07462bd8490c2b937572961062
data/.gitignore CHANGED
@@ -30,4 +30,12 @@ spec/test_files/outdir_for_heatmaps/outdir/
30
30
 
31
31
  stats.tmp.r
32
32
 
33
- scratch
33
+ scratch
34
+
35
+
36
+
37
+ spec/test_files/homology_files/output/*homology*
38
+
39
+ spec/test_files/mummer_files/output/mummer_out.txt
40
+
41
+ tmp.prodigal.stdin.5129
data/Gemfile.lock CHANGED
@@ -1,7 +1,8 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- big_simon (0.1.1)
4
+ big_simon (0.2.0)
5
+ parallel
5
6
  parse_fasta (~> 2.5, >= 2.5.2)
6
7
  rya (~> 0.4.0)
7
8
  trollop (~> 2.1, >= 2.1.3)
@@ -11,6 +12,7 @@ GEM
11
12
  specs:
12
13
  abort_if (0.2.0)
13
14
  diff-lcs (1.3)
15
+ parallel (1.12.1)
14
16
  parse_fasta (2.5.2)
15
17
  rake (10.5.0)
16
18
  rspec (3.7.0)
data/big_simon.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency "rake", "~> 10.0"
27
27
  spec.add_development_dependency "rspec", "~> 3.0"
28
28
 
29
+ spec.add_runtime_dependency "parallel"
29
30
  spec.add_runtime_dependency "parse_fasta", "~> 2.5", ">= 2.5.2"
30
31
  spec.add_runtime_dependency "rya", "~> 0.4.0"
31
32
  spec.add_runtime_dependency "trollop", "~> 2.1", ">= 2.1.3"
data/exe/big_simon CHANGED
@@ -54,7 +54,8 @@ Rya::AbortIf.abort_unless opts[:threads] > 0,
54
54
  programs = [
55
55
  "WIsH",
56
56
  "VirHostMatcher",
57
- "mummer"
57
+ "mummer",
58
+ "homology",
58
59
  ]
59
60
 
60
61
  outdir = opts[:outdir]
@@ -71,87 +72,6 @@ tmpdir_host = File.join tmpdir, "host"
71
72
  # all_predictions_fname = File.join outdir, "scores_all.txt"
72
73
  mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt"
73
74
 
74
- # virus_recs, host_recs = [], []
75
-
76
- # Tempfile.open do |vir_f|
77
- # Tempfile.open do |host_f|
78
- # virus_fnames.each do |fname|
79
- # ParseFasta::SeqFile.open(fname).each_record do |rec|
80
- # vir_f.puts rec
81
- #
82
- # vir_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
83
- # end
84
- # end
85
- #
86
- # host_fnames.each do |fname|
87
- # ParseFasta::SeqFile.open(fname).each_record do |rec|
88
- # host_f.puts rec
89
- # host_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}"
90
- # end
91
- # end
92
- #
93
- # vir_f.fsync
94
- # host_f.fsync
95
- #
96
- # cmd = "mummer -maxmatch -l 15 #{host_f.path} #{vir_f.path} > /Users/moorer/Desktop/mummer.OUT"
97
- # Process.run_and_time_it! "MUMMER", cmd
98
- # end
99
- # end
100
- #
101
- # header = nil
102
- # hits = []
103
- # hit_info = {}
104
- # virus = nil
105
- #
106
- # File.open("/Users/moorer/Desktop/mummer.OUT", "rt").each_line.with_index do |line, idx|
107
- # if line.start_with? '>'
108
- # virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
109
- #
110
- # unless hit_info.has_key? virus
111
- # hit_info[virus] = {}
112
- # end
113
- # else
114
- # host, _, _, len = line.chomp.strip.split(" ")
115
- # host = host.sub(/___reverse$/, "").strip
116
- #
117
- # unless hit_info[virus].has_key? host
118
- # hit_info[virus][host] = -1
119
- # end
120
- #
121
- # hit_info[virus][host] = len.to_i if len.to_i > hit_info[virus][host]
122
- # end
123
- # end
124
- #
125
- # puts
126
- #
127
- # hh = hit_info.map do |virus, info|
128
- # [virus, info.to_a.sort_by {|gen, len| len}.reverse]
129
- # end
130
- #
131
- # pp hh
132
-
133
- # hh = hit_info.map do |virus, info|
134
- # [virus, info.to_a.sort_by { |host, hit_len| hit_len }.reverse
135
- #
136
- # end
137
- # p hit_info
138
-
139
- scores_files = {}
140
- programs.each do |program|
141
- raw_fname = File.join outdir, "scores_raw.#{program}.txt"
142
- scaled_fname = File.join outdir, "scores_scaled.#{program}.txt"
143
-
144
- scores_files[program] = {
145
- raw: File.open(raw_fname, "w"),
146
- scaled: File.open(scaled_fname, "w")
147
- }
148
- end
149
-
150
- scores_files.each do |program, files|
151
- files.each do |name, file|
152
- file.puts %w[virus host score].join "\t"
153
- end
154
- end
155
75
 
156
76
  name_map_virus, all_ids_virus = BigSimon::Utils.set_up_tmp_dirs virus_fnames, tmpdir_virus, "virus"
157
77
  name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmpdir_host, "host"
@@ -162,12 +82,38 @@ vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpd
162
82
  # TODO separate the parser from the runner for mummer.
163
83
  host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads
164
84
 
85
+ puts "mummer"
86
+ pp host_info_mummer
87
+ puts
88
+
89
+ # TODO separate the parser from the runner for homology
90
+ host_info_homology = BigSimon::Runners.homology tmpdir_virus, tmpdir_host, tmpdir, threads
91
+
92
+ puts "homology"
93
+ pp host_info_homology
94
+ puts
95
+
96
+
97
+
165
98
  host_info_wish = BigSimon::Parsers.wish wish_outf
166
99
  host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf
167
100
 
168
- host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer], programs
101
+ puts "wish"
102
+ pp host_info_wish
103
+ puts
104
+
105
+ puts "vhm"
106
+ pp host_info_vhm
107
+ puts
108
+
109
+
110
+ host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer, host_info_homology], programs
169
111
  host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host
170
112
 
113
+ puts
114
+ pp host_info_simple_names
115
+ puts
116
+
171
117
  puts
172
118
  pp host_info
173
119
  puts
@@ -176,6 +122,25 @@ puts
176
122
  # File.open all_predictions_fname, "w" do |f|
177
123
  # f.puts %w[virus host program score scaled.score].join "\t"
178
124
 
125
+ scores_files = {}
126
+ programs.each do |program|
127
+ raw_fname = File.join outdir, "scores_raw.#{program}.txt"
128
+ scaled_fname = File.join outdir, "scores_scaled.#{program}.txt"
129
+
130
+ scores_files[program] = {
131
+ raw: File.open(raw_fname, "w"),
132
+ scaled: File.open(scaled_fname, "w")
133
+ }
134
+ end
135
+
136
+ scores_files.each do |program, files|
137
+ files.each do |name, file|
138
+ puts "LALA 3: #{[program, name, file]} #{%w[virus host score].join}"
139
+
140
+ file.puts %w[virus host score].join "\t"
141
+ end
142
+ end
143
+
179
144
  host_info.each do |virus, h1|
180
145
  h1.each do |host, h2|
181
146
  lines = {}
@@ -183,6 +148,8 @@ host_info.each do |virus, h1|
183
148
  h2[:scores].each do |program, score|
184
149
  lines[[virus, host, program]] = [score]
185
150
 
151
+ puts "LALA: #{[virus, host, score]}"
152
+
186
153
  scores_files[program][:raw].puts [virus, host, score].join "\t"
187
154
  end
188
155
 
@@ -190,6 +157,9 @@ host_info.each do |virus, h1|
190
157
  h2[:scaled_scores].each do |program, score|
191
158
  lines[[virus, host, program]] << score
192
159
 
160
+ puts "LALA 2: #{[virus, host, score]}"
161
+
162
+
193
163
  scores_files[program][:scaled].puts [virus, host, score].join "\t"
194
164
  end
195
165
 
@@ -1,5 +1,6 @@
1
1
  require "tempfile"
2
2
  require "parse_fasta"
3
+ require "parallel"
3
4
 
4
5
  module BigSimon
5
6
  class Runners
@@ -52,12 +53,15 @@ module BigSimon
52
53
  vir_f.fsync
53
54
  host_f.fsync
54
55
 
55
- cmd = "mummer -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
56
+ # -k 3 index every third position in reference (broken now, bug in mummer)
57
+ # -n -k 3 -threads 3
58
+ # -n match only A C T G
59
+ cmd = "mummer -n -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
56
60
  Process.run_and_time_it! "MUMMER", cmd
57
61
  end
58
62
  end
59
63
 
60
- virus = nil
64
+ virus = nil
61
65
  overall_max_score = 0
62
66
  File.open(mummer_outfname, "rt").each_line.with_index do |line, idx|
63
67
  line.chomp!
@@ -75,7 +79,7 @@ module BigSimon
75
79
  else
76
80
  ary = line.strip.split " "
77
81
 
78
- host = ary[0].sub(/___reverse$/, "").strip
82
+ host = ary[0].sub(/___reverse$/, "").strip
79
83
  score = ary[3].to_i
80
84
 
81
85
  Rya::AbortIf.assert hit_table[virus].has_key?(host)
@@ -88,7 +92,7 @@ module BigSimon
88
92
  hit_table[virus][host] = score if score > hit_table[virus][host]
89
93
 
90
94
  # Track the overall max for scaling.
91
- overall_max_score = score if score > overall_max_score
95
+ overall_max_score = score if score > overall_max_score
92
96
  end
93
97
  end
94
98
  end
@@ -203,6 +207,121 @@ module BigSimon
203
207
  results
204
208
  end
205
209
 
210
+ # For scoring homology-ness, I just sum the bitscores of all significant BLAST hits for each virus-host pair.
211
+ #
212
+ # @note I will make the specified outdir if it doesn't exist.
213
+ # @note Assumes that the files end with *.fa
214
+ # @note Assumes that the file names match the IDs. This SHOULD be taken care of by the big_simon program.
215
+ # @todo assert that fname thing matches sequence ID name.
216
+ def self.homology vir_dir, host_dir, outdir, threads
217
+ FileUtils.mkdir_p outdir
218
+
219
+ host_orfs = File.join outdir, "host_orfs.homology"
220
+ host_orfs_blast_db = host_orfs + ".blast_db.homology"
221
+
222
+ # Call ORFs on Hosts
223
+ cmd = "cat #{host_dir}/*.fa | #{BigSimon::PRODIGAL} " \
224
+ "-d #{host_orfs} " \
225
+ "> /dev/null"
226
+
227
+ Process.run_and_time_it! "Predicting host ORFs", cmd
228
+
229
+ # Make blast db's for the host genes.
230
+ cmd = "#{BigSimon::MAKEBLASTDB} " \
231
+ "-in #{host_orfs} " \
232
+ "-out #{host_orfs_blast_db} " \
233
+ "-dbtype nucl"
234
+
235
+ Process.run_and_time_it! "Making host blast db", cmd
236
+
237
+ vir_genome_fnames = Dir.glob(vir_dir + "/*.fa")
238
+
239
+ blast_info = Parallel.map(vir_genome_fnames, in_processes: threads) do |vir_genome_fname|
240
+ vir_orfs = File.join outdir, File.basename(vir_genome_fname) + ".vir_orfs.homology"
241
+ blast_results = File.join outdir, File.basename(vir_genome_fname) + ".blast_results.homology"
242
+
243
+ # this will be used as a viral ID.
244
+ vir_simple_fname = File.basename vir_genome_fname, ".fa"
245
+ blast_table = {}
246
+ blast_table[vir_simple_fname] = Hash.new 0
247
+
248
+ # Call ORFs on the virus.
249
+ cmd = "#{BigSimon::PRODIGAL} " \
250
+ "-d #{vir_orfs} -p meta -i #{vir_genome_fname} " \
251
+ "> /dev/null"
252
+
253
+ Process.run_and_time_it! "Predicting ORFs for #{File.basename vir_genome_fname}", cmd
254
+
255
+ # Blast the viral ORFs against the host ORF blast db.
256
+ cmd = "#{BigSimon::BLASTN} -query #{vir_orfs} -db #{host_orfs_blast_db} -outfmt 6 -evalue 0.01 -word_size 11 -out #{blast_results}"
257
+ Process.run_and_time_it! "Blasting ORFs for #{File.basename vir_genome_fname}", cmd
258
+
259
+ # Remove ORFs file
260
+ FileUtils.rm vir_orfs if File.exist? vir_orfs
261
+
262
+ Rya::AbortIf.logger.info { "Parsing #{blast_results}" }
263
+ # Parse the blast.
264
+
265
+ File.open(blast_results, "rt").each_line do |line|
266
+ ary = line.chomp.split "\t"
267
+
268
+ # The .sub() strips the trailing "_<number>" suffix (presumably the per-ORF index that prodigal appends to sequence IDs) so we get the genome ID back.
269
+ vir_id = ary[0].sub(/_[0-9]+$/, "")
270
+ host_id = ary[1].sub(/_[0-9]+$/, "")
271
+ score = ary[11].to_f
272
+
273
+ Rya::AbortIf.assert blast_table.has_key?(vir_id), "blast_table: got #{vir_id} should have been #{vir_simple_fname}"
274
+
275
+ blast_table[vir_id][host_id] += score
276
+ end
277
+
278
+ # Remove blast file
279
+ # FileUtils.rm_r blast_results if File.exist? blast_results
280
+
281
+ # Again, we're assuming the input is .fa, which the big_simon program SHOULD ensure. TODO check these things with assertions.
282
+ simple_vir_name = File.basename vir_genome_fname.sub(/.fa$/, "")
283
+
284
+ [simple_vir_name, blast_table]
285
+ end
286
+
287
+ collated_blast_table = {}
288
+ host_simple_names = Dir.glob(host_dir + "/*.fa").map { |fname| File.basename(fname, ".fa") }
289
+
290
+ Rya::AbortIf.assert host_simple_names.length == host_simple_names.uniq.length, "host simple names are not unique"
291
+
292
+ Rya::AbortIf.logger.info { "Collating blast results" }
293
+
294
+ # Get max score
295
+ max_score = -1
296
+ blast_info.each do |_, blast_table|
297
+ blast_table.each do |vir_id, host_scores|
298
+ this_max = host_scores.values.max || -1 # sometimes there are no hits at all
299
+
300
+ max_score = this_max if this_max > max_score
301
+ end
302
+ end
303
+ Rya::AbortIf.assert max_score > -1, "didn't get any scores"
304
+
305
+
306
+ klass = Class.new.extend Rya::CoreExtensions::Math
307
+ blast_info.each do |simple_vir_name, blast_table|
308
+ blast_table.each do |vir_id, host_scores|
309
+ collated_blast_table[vir_id] = []
310
+
311
+ host_simple_names.each do |host_id|
312
+ scaled_score = klass.scale host_scores[host_id], 0, max_score, 1, 0
313
+
314
+ host_table = { host: host_id, score: host_scores[host_id], scaled_score: scaled_score }
315
+ collated_blast_table[vir_id] << host_table
316
+ end
317
+ end
318
+ end
319
+
320
+ pp collated_blast_table
321
+
322
+ collated_blast_table
323
+ end
324
+
206
325
  def self.vir_host_matcher exe, vir_dir, host_dir, outdir
207
326
  FileUtils.mkdir_p outdir
208
327
 
@@ -273,13 +392,13 @@ module BigSimon
273
392
 
274
393
  out_fname = File.join outdir, "#{basename}.heatmap.pdf"
275
394
 
276
- [in_fname, out_fname]
395
+ [File.absolute_path(in_fname), File.absolute_path(out_fname)]
277
396
  end
278
397
 
279
398
 
280
399
  rcode_str = BigSimon::Utils.rcode fnames
281
400
 
282
- Object::Tempfile.open do |f|
401
+ Object::File.open(File.join(outdir, "RCODE.r"), "w") do |f|
283
402
  f.puts rcode_str
284
403
  f.fsync # ensure no data is buffered
285
404
 
@@ -70,6 +70,7 @@ draw.heatmap("#{in_fname}", "#{out_fname}")
70
70
  1 - Math.exp(ll)
71
71
  end
72
72
 
73
+ # @note I also rename all the sequences in the tmp fasta files with the new ID.
73
74
  def self.set_up_tmp_dirs fastas, tmpdir, which
74
75
  Object::FileUtils.mkdir_p tmpdir
75
76
 
@@ -92,7 +93,7 @@ draw.heatmap("#{in_fname}", "#{out_fname}")
92
93
  outfname = File.join tmpdir, "#{new_id}.fa"
93
94
 
94
95
  File.open(outfname, "w") do |f|
95
- f.puts rec
96
+ f.puts ">#{new_id}\n#{rec.seq}" # TODO HERE
96
97
  end
97
98
  end
98
99
  end
@@ -1,5 +1,5 @@
1
1
  module BigSimon
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
 
4
4
  COPYRIGHT = "2018 Ryan Moore"
5
5
  CONTACT = "moorer@udel.edu"
data/lib/big_simon.rb CHANGED
@@ -20,8 +20,14 @@ module BigSimon
20
20
  BIN = File.join ROOT, "vendor", "bin", "mac"
21
21
  SPEC = File.join ROOT, "spec"
22
22
  TEST_FILES = File.join SPEC, "test_files"
23
+
24
+ # Programs
23
25
  WISH = File.join BIN, "WIsH"
24
26
  VHM = File.join BIN, "vhm.py"
25
27
  MUMMER = File.join BIN, "mummer"
26
28
  RSCRIPT = "Rscript"
29
+
30
+ BLASTN = File.join BIN, "blastn"
31
+ MAKEBLASTDB = File.join BIN, "makeblastdb"
32
+ PRODIGAL = File.join BIN, "prodigal"
27
33
  end
Binary file
Binary file
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: big_simon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-07-30 00:00:00.000000000 Z
11
+ date: 2018-07-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: parallel
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: parse_fasta
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -137,10 +151,13 @@ files:
137
151
  - lib/big_simon/utils.rb
138
152
  - lib/big_simon/version.rb
139
153
  - vendor/bin/mac/WIsH
154
+ - vendor/bin/mac/blastn
140
155
  - vendor/bin/mac/computeMeasure.out
141
156
  - vendor/bin/mac/computeMeasure_onlyd2star.out
142
157
  - vendor/bin/mac/countKmer.out
158
+ - vendor/bin/mac/makeblastdb
143
159
  - vendor/bin/mac/mummer
160
+ - vendor/bin/mac/prodigal
144
161
  - vendor/bin/mac/vhm.py
145
162
  homepage: https://github.com/mooreryan/big_simon
146
163
  licenses: []