big_simon 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dfbea3a58014cbec45a3959ad076fd842c6b393c
4
- data.tar.gz: 69d9f8ac1dea196f64dd13fb06a13a4c26b38e45
3
+ metadata.gz: 9c81d1057b304f170cdf1dd6a7551bd981efc232
4
+ data.tar.gz: c0550efe308bc0080c9f14ce65057b100ff2f32b
5
5
  SHA512:
6
- metadata.gz: 3a54fe903bb5c0f2f574a389dfc945ffe3d93bd2b1a15361bda81430542c2ef3990ac31e67b671690e056e23544e4f30f21dbecd756050dfb98dfb924fd15ca3
7
- data.tar.gz: b61f6ba10b7efc267419ccccaef82654662923629e0b2b61e5a3c5e74b79fe8b753dd160c3e3d7fd57b1e926b54684f9367a839075544c496ff3a839ece57fb5
6
+ metadata.gz: 652abc1ab4507ed51012cde1a541a8e5ce75e30a2c2d8b9ef4cb0363e49bb8ea4f088dcb4786bb5f9f466f8f0ed0d2f2e6d9b446aa3a4c8fd2a6a8f4370f6a78
7
+ data.tar.gz: 3a82d66018a1dea01afb9f0230a9f8ad6ac90d96d5862a79b57ecb03f954eedf9183c2dc9d03b6f98a6a0675b3293577c4e5001fa7f792bbe68ebefe397d47f8
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- big_simon (0.1.0)
4
+ big_simon (0.1.1)
5
5
  parse_fasta (~> 2.5, >= 2.5.2)
6
6
  rya (~> 0.4.0)
7
7
  trollop (~> 2.1, >= 2.1.3)
data/Makefile CHANGED
@@ -1,8 +1,8 @@
1
1
  test_small:
2
- rm -r 0000TEST/; exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
2
+ rm -r 0000TEST/; time exe/big_simon -t 3 -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
3
3
 
4
4
  test_small_install:
5
- rm -r 0000TEST/; rake install && exe/big_simon -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
5
+ rm -r 0000TEST/; rake install && time exe/big_simon -t 3 -v spec/test_files/virus/* -h spec/test_files/host/* -o 0000TEST && tree 0000TEST
6
6
 
7
7
  test_toy:
8
8
  rm -r toyexample_out; time exe/big_simon -v vendor/repos/VirHostMatcher/test/toyexample/virus/* -h vendor/repos/VirHostMatcher/test/toyexample/host/* -o toyexample_out -t 3
data/exe/big_simon CHANGED
@@ -71,7 +71,7 @@ tmpdir_host = File.join tmpdir, "host"
71
71
  # all_predictions_fname = File.join outdir, "scores_all.txt"
72
72
  mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt"
73
73
 
74
- virus_recs, host_recs = [], []
74
+ # virus_recs, host_recs = [], []
75
75
 
76
76
  # Tempfile.open do |vir_f|
77
77
  # Tempfile.open do |host_f|
@@ -159,38 +159,18 @@ name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmp
159
159
  wish_outf = BigSimon::Runners.wish BigSimon::WISH, tmpdir_virus, tmpdir_host, tmpdir, threads
160
160
  vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpdir_host, tmpdir
161
161
 
162
+ # TODO separate the parser from the runner for mummer.
162
163
  host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads
163
164
 
164
- # Map them back to simple names. TODO just have it spit these out from the beginning.
165
- host_info_mummer_simple_names = {}
166
- inverted_name_map_virus = name_map_virus.invert
167
- inverted_name_map_host = name_map_host.invert
168
-
169
- host_info_mummer.each do |virus, host_tables|
170
- virname = virus
171
- if inverted_name_map_virus.has_key? virus
172
- virname = inverted_name_map_virus[virus]
173
- end
174
-
175
- host_info_mummer_simple_names[virname] = []
176
-
177
- host_tables.map do |table|
178
- hostname = inverted_name_map_host.has_key?(table[:host]) ? inverted_name_map_host[table[:host]] : table[:host]
179
- new_table = { host: hostname, score: table[:score], scaled_score: table[:scaled_score] }
180
-
181
- host_info_mummer_simple_names[virname] << new_table
182
- end
183
- end
184
-
185
165
  host_info_wish = BigSimon::Parsers.wish wish_outf
186
166
  host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf
187
167
 
188
- host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer_simple_names], programs
168
+ host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer], programs
189
169
  host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host
190
170
 
191
- # puts
192
- # pp host_info
193
- # puts
171
+ puts
172
+ pp host_info
173
+ puts
194
174
 
195
175
  # Just a basic all info file
196
176
  # File.open all_predictions_fname, "w" do |f|
@@ -3,10 +3,15 @@ Signal.trap("PIPE", "EXIT")
3
3
 
4
4
  require "pp"
5
5
 
6
+ require "big_simon"
7
+
8
+ Rya::AbortIf.abort_unless ARGV.count >= 1, "usage: TOP=number consensus_predictions scores_scaled.*.txt > consensus_predictions.txt"
9
+
6
10
  by_program = {}
7
11
  lines = []
8
12
 
9
- TOP = (ENV["TOP"] || 5).to_i
13
+ TOP = (ENV["TOP"] || 3).to_i
14
+ AT_LEAST = (ENV["AT_LEAST"] || 2).to_i
10
15
 
11
16
  ARGV.each do |fname|
12
17
  scores = {}
@@ -78,16 +83,18 @@ by_program.each do |virus, program_tables|
78
83
  end
79
84
  end
80
85
 
81
- first_table.each do |virus, host_counts|
82
- host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
83
- STDERR.puts [virus, :best, host, programs.count, programs].join "\t"
84
- end
85
- STDERR.puts
86
- end
86
+ # first_table.each do |virus, host_counts|
87
+ # host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
88
+ # STDERR.puts [virus, :best, host, programs.count, programs].join "\t"
89
+ # end
90
+ # STDERR.puts
91
+ # end
87
92
 
88
93
  top_host_table.each do |virus, host_counts|
94
+ puts "\n\n>Virus-------#{virus}"
89
95
  host_counts.sort_by { |_, programs| programs.count }.reverse.each do |host, programs|
96
+ if programs.count >= AT_LEAST
90
97
  puts [virus, :top_N, host, programs.count, programs].join "\t"
98
+ end
91
99
  end
92
- puts
93
100
  end
@@ -1,11 +1,118 @@
1
1
  require "tempfile"
2
+ require "parse_fasta"
2
3
 
3
4
  module BigSimon
4
5
  class Runners
5
6
 
7
+ # @note To match the other things, you'd like them to be keyed on the file name.
8
+ def self.mummer exe, vir_dir, host_dir, outdir, threads
9
+ klass = Class.new.extend Rya::CoreExtensions::Math
10
+ FileUtils.mkdir_p outdir
11
+
12
+ mummer_outfname = File.join outdir, "mummer_out.txt"
13
+
14
+ virus_fnames = Dir.glob(vir_dir + "/*")
15
+ host_fnames = Dir.glob(host_dir + "/*")
16
+
17
+ hit_table = {}
18
+
19
+ Tempfile.open do |vir_f|
20
+ Tempfile.open do |host_f|
21
+ virus_fnames.each do |fname|
22
+ Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"
23
+
24
+ Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
25
+ # id needs to be the file name
26
+ new_id = File.basename fname.sub(/.fa$/, "")
27
+
28
+ hit_table[new_id] = {}
29
+
30
+ vir_f.puts ">#{new_id}\n#{rec.seq}"
31
+
32
+ vir_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
33
+ end
34
+ end
35
+
36
+ host_fnames.each do |fname|
37
+ Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"
38
+
39
+ Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
40
+ new_id = File.basename fname.sub(/.fa$/, "")
41
+
42
+ # Add this host to each virus in the hit_table
43
+ hit_table.each do |virus, host_table|
44
+ host_table[new_id] = 0 # set it to default score of 0
45
+ end
46
+
47
+ host_f.puts ">#{new_id}\n#{rec.seq}"
48
+ host_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
49
+ end
50
+ end
51
+
52
+ vir_f.fsync
53
+ host_f.fsync
54
+
55
+ cmd = "mummer -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
56
+ Process.run_and_time_it! "MUMMER", cmd
57
+ end
58
+ end
59
+
60
+ virus = nil
61
+ overall_max_score = 0
62
+ File.open(mummer_outfname, "rt").each_line.with_index do |line, idx|
63
+ line.chomp!
64
+
65
+ unless line.empty?
66
+ if line.start_with? ">"
67
+ virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
68
+
69
+ # The virus ID can be duplicated, as there are forward and reverse records for each sequence (in case they're contigs).
70
+
71
+ Rya::AbortIf.assert hit_table.has_key?(virus)
72
+ # unless hit_table.has_key? virus
73
+ # hit_table[virus] = {}
74
+ # end
75
+ else
76
+ ary = line.strip.split " "
77
+
78
+ host = ary[0].sub(/___reverse$/, "").strip
79
+ score = ary[3].to_i
80
+
81
+ Rya::AbortIf.assert hit_table[virus].has_key?(host)
82
+
83
+ # unless hit_table[virus].has_key? host
84
+ # hit_table[virus][host] = -1
85
+ # end
86
+
87
+ # We only want the longest hit.
88
+ hit_table[virus][host] = score if score > hit_table[virus][host]
89
+
90
+ # Track the overall max for scaling.
91
+ overall_max_score = score if score > overall_max_score
92
+ end
93
+ end
94
+ end
95
+
96
+ results_table = {}
97
+
98
+ min, max, from, to = 0, overall_max_score, 1, 0
99
+
100
+ hit_table.each do |virus, host_table|
101
+ results_table[virus] = []
102
+
103
+ host_table.each do |host, score|
104
+ scaled_score = klass.scale score, min, max, from, to
105
+
106
+ results_table[virus] << { host: host, score: score, scaled_score: scaled_score }
107
+ end
108
+ end
109
+
110
+ results_table
111
+ end
112
+
6
113
  # This one's a bit different as it parses as well and returns original names.
7
114
  # @todo Also do the reverse of each genome in case it's a contig.
8
- def self.mummer exe, vir_dir, host_dir, outdir, threads
115
+ def self.mummer2 exe, vir_dir, host_dir, outdir, threads
9
116
  klass = Class.new.extend Rya::CoreExtensions::Math
10
117
  FileUtils.mkdir_p outdir
11
118
 
@@ -1,5 +1,5 @@
1
1
  module BigSimon
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
 
4
4
  COPYRIGHT = "2018 Ryan Moore"
5
5
  CONTACT = "moorer@udel.edu"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: big_simon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
@@ -111,7 +111,7 @@ email:
111
111
  - moorer@udel.edu
112
112
  executables:
113
113
  - big_simon
114
- - ranks
114
+ - consensus_predictions
115
115
  extensions: []
116
116
  extra_rdoc_files: []
117
117
  files:
@@ -129,7 +129,7 @@ files:
129
129
  - bin/console
130
130
  - bin/setup
131
131
  - exe/big_simon
132
- - exe/ranks
132
+ - exe/consensus_predictions
133
133
  - lib/big_simon.rb
134
134
  - lib/big_simon/parsers.rb
135
135
  - lib/big_simon/pipeline.rb