big_simon 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dfbea3a58014cbec45a3959ad076fd842c6b393c
4
- data.tar.gz: 69d9f8ac1dea196f64dd13fb06a13a4c26b38e45
3
+ metadata.gz: 9c81d1057b304f170cdf1dd6a7551bd981efc232
4
+ data.tar.gz: c0550efe308bc0080c9f14ce65057b100ff2f32b
5
5
  SHA512:
6
- metadata.gz: 3a54fe903bb5c0f2f574a389dfc945ffe3d93bd2b1a15361bda81430542c2ef3990ac31e67b671690e056e23544e4f30f21dbecd756050dfb98dfb924fd15ca3
7
- data.tar.gz: b61f6ba10b7efc267419ccccaef82654662923629e0b2b61e5a3c5e74b79fe8b753dd160c3e3d7fd57b1e926b54684f9367a839075544c496ff3a839ece57fb5
6
+ metadata.gz: 652abc1ab4507ed51012cde1a541a8e5ce75e30a2c2d8b9ef4cb0363e49bb8ea4f088dcb4786bb5f9f466f8f0ed0d2f2e6d9b446aa3a4c8fd2a6a8f4370f6a78
7
+ data.tar.gz: 3a82d66018a1dea01afb9f0230a9f8ad6ac90d96d5862a79b57ecb03f954eedf9183c2dc9d03b6f98a6a0675b3293577c4e5001fa7f792bbe68ebefe397d47f8
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- big_simon (0.1.0)
4
+ big_simon (0.1.1)
5
5
  parse_fasta (~> 2.5, >= 2.5.2)
6
6
  rya (~> 0.4.0)
7
7
  trollop (~> 2.1, >= 2.1.3)
data/Makefile CHANGED
@@ -1,8 +1,8 @@
1
1
  test_small:
2
- rm -r 0000TEST/; exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
2
+ rm -r 0000TEST/; time exe/big_simon -t 3 -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
3
3
 
4
4
  test_small_install:
5
- rm -r 0000TEST/; rake install && exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
5
+ rm -r 0000TEST/; rake install && time exe/big_simon -t 3 -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
6
6
 
7
7
  test_toy:
8
8
  rm -r toyexample_out; time exe/big_simon -v vendor/repos/VirHostMatcher/test/toyexample/virus/* -h vendor/repos/VirHostMatcher/test/toyexample/host/* -o toyexample_out -t 3
data/exe/big_simon CHANGED
@@ -71,7 +71,7 @@ tmpdir_host = File.join tmpdir, "host"
71
71
  # all_predictions_fname = File.join outdir, "scores_all.txt"
72
72
  mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt"
73
73
 
74
- virus_recs, host_recs = [], []
74
+ # virus_recs, host_recs = [], []
75
75
 
76
76
  # Tempfile.open do |vir_f|
77
77
  # Tempfile.open do |host_f|
@@ -159,38 +159,18 @@ name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmp
159
159
  wish_outf = BigSimon::Runners.wish BigSimon::WISH, tmpdir_virus, tmpdir_host, tmpdir, threads
160
160
  vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpdir_host, tmpdir
161
161
 
162
+ # TODO separate the parser from the runner for mummer.
162
163
  host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads
163
164
 
164
- # Map them back to simple names. TODO just have it spit these out from the beginning.
165
- host_info_mummer_simple_names = {}
166
- inverted_name_map_virus = name_map_virus.invert
167
- inverted_name_map_host = name_map_host.invert
168
-
169
- host_info_mummer.each do |virus, host_tables|
170
- virname = virus
171
- if inverted_name_map_virus.has_key? virus
172
- virname = inverted_name_map_virus[virus]
173
- end
174
-
175
- host_info_mummer_simple_names[virname] = []
176
-
177
- host_tables.map do |table|
178
- hostname = inverted_name_map_host.has_key?(table[:host]) ? inverted_name_map_host[table[:host]] : table[:host]
179
- new_table = { host: hostname, score: table[:score], scaled_score: table[:scaled_score] }
180
-
181
- host_info_mummer_simple_names[virname] << new_table
182
- end
183
- end
184
-
185
165
  host_info_wish = BigSimon::Parsers.wish wish_outf
186
166
  host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf
187
167
 
188
- host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer_simple_names], programs
168
+ host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer], programs
189
169
  host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host
190
170
 
191
- # puts
192
- # pp host_info
193
- # puts
171
+ puts
172
+ pp host_info
173
+ puts
194
174
 
195
175
  # Just a basic all info file
196
176
  # File.open all_predictions_fname, "w" do |f|
@@ -3,10 +3,15 @@ Signal.trap("PIPE", "EXIT")
3
3
 
4
4
  require "pp"
5
5
 
6
+ require "big_simon"
7
+
8
+ Rya::AbortIf.abort_unless ARGV.count >= 1, "usage: TOP=number consensus_predictions scores_scaled.*.txt > consensus_predictions.txt"
9
+
6
10
  by_program = {}
7
11
  lines = []
8
12
 
9
- TOP = (ENV["TOP"] || 5).to_i
13
+ TOP = (ENV["TOP"] || 3).to_i
14
+ AT_LEAST = (ENV["AT_LEAST"] || 2).to_i
10
15
 
11
16
  ARGV.each do |fname|
12
17
  scores = {}
@@ -78,16 +83,18 @@ by_program.each do |virus, program_tables|
78
83
  end
79
84
  end
80
85
 
81
- first_table.each do |virus, host_counts|
82
- host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
83
- STDERR.puts [virus, :best, host, programs.count, programs].join "\t"
84
- end
85
- STDERR.puts
86
- end
86
+ # first_table.each do |virus, host_counts|
87
+ # host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
88
+ # STDERR.puts [virus, :best, host, programs.count, programs].join "\t"
89
+ # end
90
+ # STDERR.puts
91
+ # end
87
92
 
88
93
  top_host_table.each do |virus, host_counts|
94
+ puts "\n\n>Virus-------#{virus}"
89
95
  host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
96
+ if programs.count >= AT_LEAST
90
97
  puts [virus, :top_N, host, programs.count, programs].join "\t"
98
+ end
91
99
  end
92
- puts
93
100
  end
@@ -1,11 +1,118 @@
1
1
  require "tempfile"
2
+ require "parse_fasta"
2
3
 
3
4
  module BigSimon
4
5
  class Runners
5
6
 
7
+ # @note To match the other things, you'd like them to be key'd on the file name.
8
+ def self.mummer exe, vir_dir, host_dir, outdir, threads
9
+ klass = Class.new.extend Rya::CoreExtensions::Math
10
+ FileUtils.mkdir_p outdir
11
+
12
+ mummer_outfname = File.join outdir, "mummer_out.txt"
13
+
14
+ virus_fnames = Dir.glob(vir_dir + "/*")
15
+ host_fnames = Dir.glob(host_dir + "/*")
16
+
17
+ hit_table = {}
18
+
19
+ Tempfile.open do |vir_f|
20
+ Tempfile.open do |host_f|
21
+ virus_fnames.each do |fname|
22
+ Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"
23
+
24
+ Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
25
+ # id needs to be the file name
26
+ new_id = File.basename fname.sub(/.fa$/, "")
27
+
28
+ hit_table[new_id] = {}
29
+
30
+ vir_f.puts ">#{new_id}\n#{rec.seq}"
31
+
32
+ vir_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
33
+ end
34
+ end
35
+
36
+ host_fnames.each do |fname|
37
+ Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"
38
+
39
+ Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
40
+ new_id = File.basename fname.sub(/.fa$/, "")
41
+
42
+ # Add this host to each virus in the hit_table
43
+ hit_table.each do |virus, host_table|
44
+ host_table[new_id] = 0 # set it to defualt score of 0
45
+ end
46
+
47
+ host_f.puts ">#{new_id}\n#{rec.seq}"
48
+ host_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
49
+ end
50
+ end
51
+
52
+ vir_f.fsync
53
+ host_f.fsync
54
+
55
+ cmd = "mummer -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
56
+ Process.run_and_time_it! "MUMMER", cmd
57
+ end
58
+ end
59
+
60
+ virus = nil
61
+ overall_max_score = 0
62
+ File.open(mummer_outfname, "rt").each_line.with_index do |line, idx|
63
+ line.chomp!
64
+
65
+ unless line.empty?
66
+ if line.start_with? ">"
67
+ virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
68
+
69
+ # It can be duplicated as there are forward and reverse for each sequence (in case they're contigs.)
70
+
71
+ Rya::AbortIf.assert hit_table.has_key?(virus)
72
+ # unless hit_table.has_key? virus
73
+ # hit_table[virus] = {}
74
+ # end
75
+ else
76
+ ary = line.strip.split " "
77
+
78
+ host = ary[0].sub(/___reverse$/, "").strip
79
+ score = ary[3].to_i
80
+
81
+ Rya::AbortIf.assert hit_table[virus].has_key?(host)
82
+
83
+ # unless hit_table[virus].has_key? host
84
+ # hit_table[virus][host] = -1
85
+ # end
86
+
87
+ # We only want the longest hit.
88
+ hit_table[virus][host] = score if score > hit_table[virus][host]
89
+
90
+ # Track the overall max for scaling.
91
+ overall_max_score = score if score > overall_max_score
92
+ end
93
+ end
94
+ end
95
+
96
+ results_table = {}
97
+
98
+ min, max, from, to = 0, overall_max_score, 1, 0
99
+
100
+ hit_table.each do |virus, host_table|
101
+ results_table[virus] = []
102
+
103
+ host_table.each do |host, score|
104
+ scaled_score = klass.scale score, min, max, from, to
105
+
106
+ results_table[virus] << { host: host, score: score, scaled_score: scaled_score }
107
+ end
108
+ end
109
+
110
+ results_table
111
+ end
112
+
6
113
  # This one's a bit different as it parses as well and returns original names.
7
114
  # @todo Also do the reverse of each genome in case it's a contig.
8
- def self.mummer exe, vir_dir, host_dir, outdir, threads
115
+ def self.mummer2 exe, vir_dir, host_dir, outdir, threads
9
116
  klass = Class.new.extend Rya::CoreExtensions::Math
10
117
  FileUtils.mkdir_p outdir
11
118
 
@@ -1,5 +1,5 @@
1
1
  module BigSimon
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
 
4
4
  COPYRIGHT = "2018 Ryan Moore"
5
5
  CONTACT = "moorer@udel.edu"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: big_simon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
@@ -111,7 +111,7 @@ email:
111
111
  - moorer@udel.edu
112
112
  executables:
113
113
  - big_simon
114
- - ranks
114
+ - consensus_predictions
115
115
  extensions: []
116
116
  extra_rdoc_files: []
117
117
  files:
@@ -129,7 +129,7 @@ files:
129
129
  - bin/console
130
130
  - bin/setup
131
131
  - exe/big_simon
132
- - exe/ranks
132
+ - exe/consensus_predictions
133
133
  - lib/big_simon.rb
134
134
  - lib/big_simon/parsers.rb
135
135
  - lib/big_simon/pipeline.rb